Commit 3be6c25c authored by songlinfeng's avatar songlinfeng
Browse files

support dtk-ctk dcu-tracker

parent 6b8eb612
......@@ -5,14 +5,27 @@
package main
import (
"dtk-container-toolkit/internal/dcu-tracker"
"dtk-container-toolkit/internal/runtime"
"os"
//"log"
)
func main() {
//f, _ := os.OpenFile("/var/log/app.log", os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0644)
//log.SetOutput(f)
r := runtime.New()
//log.Println("Log written to file")
err := r.Run(os.Args)
if err != nil {
if err != nil {
// log.Println("failed to run runtime")
dcuTracker, err := dcuTracker.New()
if err != nil {
// log.Println("failed to new dcutracker")
os.Exit(1)
}
dcuTracker.ReleaseDCUs(os.Args[len(os.Args)-1])
// log.Println("releasedcu")
os.Exit(1)
}
}
package dcuTracker
import (
"dtk-container-toolkit/cmd/dtk-ctk/dcu-tracker/disable"
"dtk-container-toolkit/cmd/dtk-ctk/dcu-tracker/enable"
"dtk-container-toolkit/cmd/dtk-ctk/dcu-tracker/initialize"
"dtk-container-toolkit/cmd/dtk-ctk/dcu-tracker/release"
"dtk-container-toolkit/cmd/dtk-ctk/dcu-tracker/reset"
"dtk-container-toolkit/cmd/dtk-ctk/dcu-tracker/status"
"dtk-container-toolkit/internal/dcu-tracker"
"dtk-container-toolkit/internal/logger"
"fmt"
"os/user"
"github.com/urfave/cli/v2"
)
type command struct {
logger logger.Interface
}
func NewCommand(logger logger.Interface) *cli.Command {
c := command{
logger: logger,
}
return c.build()
}
func (m command) build() *cli.Command {
dcuTrackerCmd := cli.Command{
Name: "dcu-tracker",
Usage: "DCU Tracker related commands",
UsageText: `dtk-ctk dcu-tracker [dcu-ids] [accessibility]
Arguments:
dcu-ids Comma-separated list of DCU IDs (comma separated list, range operator, all)
accessibility Must be either 'exclusive' or 'shared'
Examples:
dtk-ctk dcu-tracker 0,1,2 exclusive
dtk-ctk dcu-tracker 0,1-2 shared
dtk-ctk dcu-tracker all shared
OR
dtk-ctk dcu-tracker [command] [options]`,
Before: func(c *cli.Context) error { return m.validateGenOptions(c) },
Action: func(c *cli.Context) error { return m.performAction(c) },
}
dcuTrackerCmd.Subcommands = []*cli.Command{
disable.NewCommand(m.logger),
enable.NewCommand(m.logger),
initialize.NewCommand(m.logger),
reset.NewCommand(m.logger),
release.NewCommand(m.logger),
status.NewCommand(m.logger),
}
return &dcuTrackerCmd
}
func (m command) validateGenOptions(c *cli.Context) error {
curUser, err := user.Current()
if err != nil || curUser.Uid != "0" {
return fmt.Errorf("Permission denied: Not running as root")
}
return nil
}
func (m command) performAction(c *cli.Context) error {
if c.Args().Len() == 0 {
return cli.ShowAppHelp(c)
}
if c.Args().Len() > 2 {
return cli.Exit("Error: Missing arguments. Usgae: dcu-tracker <dcu_id> <operation>", 1)
}
gpuIDs := c.Args().Get(0)
operation := c.Args().Get(1)
dcuTracker, err := dcuTracker.New()
if err != nil {
return fmt.Errorf("Failed to create DCU tracker, Error: %v", err)
}
switch operation {
case "exclusive":
dcuTracker.MakeDCUsExclusive(gpuIDs)
case "shared":
dcuTracker.MakeDCUsShared(gpuIDs)
default:
return cli.Exit("Error: Invalid operation. Must be 'exclusive' or 'shared", 1)
}
return nil
}
package disable
import (
dcuTracker "dtk-container-toolkit/internal/dcu-tracker"
"fmt"
"os/user"
"dtk-container-toolkit/internal/logger"
"github.com/urfave/cli/v2"
)
type command struct {
logger logger.Interface
}
func NewCommand(logger logger.Interface) *cli.Command {
c := command{
logger: logger,
}
return c.build()
}
func (m command) build() *cli.Command {
dcuTrackerDisableCmd := cli.Command{
Name: "disable",
Usage: "Disable the DCU Tracker",
UsageText: "dtk-ctk dcu-tracker disable [options]",
Before: func(c *cli.Context) error { return validateGenOptions(c) },
Action: func(c *cli.Context) error { return performAction(c) },
}
return &dcuTrackerDisableCmd
}
func validateGenOptions(c *cli.Context) error {
curUser, err := user.Current()
if err != nil || curUser.Uid != "0" {
return fmt.Errorf("Permission denied: Not running as root")
}
return nil
}
func performAction(c *cli.Context) error {
dcuTracker, err := dcuTracker.New()
if err != nil {
return fmt.Errorf("Failed to create GPU tracker, Error: %v", err)
}
err = dcuTracker.Disable()
if err != nil {
return fmt.Errorf("Failed to Reset GPU Tracker, Error: %v", err)
}
return nil
}
package enable
import (
dcuTracker "dtk-container-toolkit/internal/dcu-tracker"
"fmt"
"os/user"
"dtk-container-toolkit/internal/logger"
"github.com/urfave/cli/v2"
)
type command struct {
logger logger.Interface
}
func NewCommand(logger logger.Interface) *cli.Command {
c := command{
logger: logger,
}
return c.build()
}
func (m command) build() *cli.Command {
dcuTrackerEnableCmd := cli.Command{
Name: "enable",
Usage: "Enable the DCU Tracker",
UsageText: "dtk-ctk dcu-tracker enable [options]",
Before: func(c *cli.Context) error { return validateGenOptions(c) },
Action: func(c *cli.Context) error { return performAction(c) },
}
return &dcuTrackerEnableCmd
}
func validateGenOptions(c *cli.Context) error {
curUser, err := user.Current()
if err != nil || curUser.Uid != "0" {
return fmt.Errorf("Permission denied: Not running as root")
}
return nil
}
func performAction(c *cli.Context) error {
dcuTracker, err := dcuTracker.New()
if err != nil {
return fmt.Errorf("Failed to create DCU tracker, Error: %v", err)
}
err = dcuTracker.Enable()
if err != nil {
return fmt.Errorf("Failed to Reset DCU Tracker, Error: %v", err)
}
return nil
}
package initialize
import (
dcuTracker "dtk-container-toolkit/internal/dcu-tracker"
"fmt"
"os/user"
"dtk-container-toolkit/internal/logger"
"github.com/urfave/cli/v2"
)
type command struct {
logger logger.Interface
}
func NewCommand(logger logger.Interface) *cli.Command {
c := command{
logger: logger,
}
return c.build()
}
func (m command) build() *cli.Command {
// dcuTrackerInitCmd initializes the DCU Tracker.
dcuTrackerInitCmd := cli.Command{
Name: "init",
Hidden: true,
Usage: "Initialize the DCU Tracker",
UsageText: "dtk-ctk dcu-tracker init [options]",
Before: func(c *cli.Context) error { return validateGenOptions(c) },
Action: func(c *cli.Context) error { return performAction(c) },
}
return &dcuTrackerInitCmd
}
func validateGenOptions(c *cli.Context) error {
curUser, err := user.Current()
if err != nil || curUser.Uid != "0" {
return fmt.Errorf("Permission denied: Not running as root")
}
return nil
}
func performAction(c *cli.Context) error {
dcuTracker, err := dcuTracker.New()
if err != nil {
return fmt.Errorf("Failed to create GPU tracker, Error: %v", err)
}
err = dcuTracker.Init()
if err != nil {
return fmt.Errorf("Failed to Initialize GPU Tracker, Error: %v", err)
}
return nil
}
package release
import (
dcuTracker "dtk-container-toolkit/internal/dcu-tracker"
"fmt"
"os/user"
"dtk-container-toolkit/internal/logger"
"github.com/urfave/cli/v2"
)
type command struct {
logger logger.Interface
}
func NewCommand(logger logger.Interface) *cli.Command {
c := command{
logger: logger,
}
return c.build()
}
func (m command) build() *cli.Command {
dcuTrackerReleaseCmd := cli.Command{
Name: "release",
Hidden: true,
Usage: "Release DCUs used by a container",
UsageText: `dtk-ctk dcu-tracker release [container_id]
Arguments:
container_id container ID of the container
Examples:
dtk-ctk dcu-tracker release a4e19862b4e2a1b04a1f793f346d0411f4a0a3857578c526a25ac6c858168fd8`,
Before: func(c *cli.Context) error {
return validateGenOptions(c)
},
Action: func(c *cli.Context) error {
return performAction(c)
},
}
return &dcuTrackerReleaseCmd
}
func validateGenOptions(c *cli.Context) error {
curUser, err := user.Current()
if err != nil || curUser.Uid != "0" {
return fmt.Errorf("Permission denied: Not running as root")
}
return nil
}
func performAction(c *cli.Context) error {
if c.Args().Len() == 0 {
return cli.ShowAppHelp(c)
}
dcuTracker, err := dcuTracker.New()
if err != nil {
return fmt.Errorf("Failed to create GPU tracker, Error: %v", err)
}
err = dcuTracker.ReleaseDCUs(c.Args().Get(0))
if err != nil {
return fmt.Errorf("Failed to release GPUs, Error: %v", err)
}
return nil
}
package reset
import (
dcuTracker "dtk-container-toolkit/internal/dcu-tracker"
"fmt"
"os/user"
"dtk-container-toolkit/internal/logger"
"github.com/urfave/cli/v2"
)
type command struct {
logger logger.Interface
}
func NewCommand(logger logger.Interface) *cli.Command {
c := command{
logger: logger,
}
return c.build()
}
func (m command) build() *cli.Command {
dcuTrackerResetCmd := cli.Command{
Name: "reset",
Usage: "Reset the DCU Tracker",
UsageText: "dtk-ctk dcu-tracker reset [options]",
Before: func(c *cli.Context) error {
return validateGenOptions(c)
},
Action: func(c *cli.Context) error {
return performAction(c)
},
}
return &dcuTrackerResetCmd
}
func validateGenOptions(c *cli.Context) error {
curUser, err := user.Current()
if err != nil || curUser.Uid != "0" {
return fmt.Errorf("Permission denied: Not running as root")
}
return nil
}
func performAction(c *cli.Context) error {
dcuTracker, err := dcuTracker.New()
if err != nil {
return fmt.Errorf("Failed to create DCU tracker, Error: %v", err)
}
err = dcuTracker.Reset()
if err != nil {
return fmt.Errorf("Failed to Reset DCU Tracker, Error: %v", err)
}
return nil
}
package status
import (
dcuTracker "dtk-container-toolkit/internal/dcu-tracker"
"fmt"
"os/user"
"dtk-container-toolkit/internal/logger"
"github.com/urfave/cli/v2"
)
type command struct {
logger logger.Interface
}
func NewCommand(logger logger.Interface) *cli.Command {
c := command{
logger: logger,
}
return c.build()
}
func (c *command) build() *cli.Command {
dcuTrackerStatusCmd := cli.Command{
Name: "status",
Usage: "Show Status of DCUs",
UsageText: "dtk-ctk dcu-tracker status [options]",
Before: func(c *cli.Context) error {
return validateGenOptions(c)
},
Action: func(c *cli.Context) error {
return performAction(c)
},
}
return &dcuTrackerStatusCmd
}
func validateGenOptions(c *cli.Context) error {
curUser, err := user.Current()
if err != nil || curUser.Uid != "0" {
return fmt.Errorf("Permission denied: Not running as root")
}
return nil
}
func performAction(c *cli.Context) error {
dcuTracker, err := dcuTracker.New()
if err != nil {
return fmt.Errorf("Failed to create GPU tracker, Error: %v", err)
}
err = dcuTracker.ShowStatus()
if err != nil {
return fmt.Errorf("Failed to show GPUs status, Error: %v", err)
}
return nil
}
......@@ -7,6 +7,7 @@ package main
import (
"dtk-container-toolkit/cmd/dtk-ctk/cdi"
"dtk-container-toolkit/cmd/dtk-ctk/config"
"dtk-container-toolkit/cmd/dtk-ctk/dcu-tracker"
"dtk-container-toolkit/cmd/dtk-ctk/hook"
"dtk-container-toolkit/cmd/dtk-ctk/runtime"
"dtk-container-toolkit/cmd/dtk-ctk/rootless"
......@@ -73,6 +74,7 @@ func main() {
// Define the subcommands
c.Commands = []*cli.Command{
rootless.NewCommand(logger),
dcuTracker.NewCommand(logger),
runtime.NewCommand(logger),
config.NewCommand(logger),
hook.NewCommand(logger),
......
module dtk-container-toolkit
go 1.24
go 1.24.0
toolchain go1.24.6
require (
github.com/gofrs/flock v0.13.0
github.com/opencontainers/runtime-spec v1.2.1
github.com/pelletier/go-toml v1.9.5
github.com/sirupsen/logrus v1.9.3
github.com/urfave/cli/v2 v2.27.5
gopkg.in/ini.v1 v1.67.0
tags.cncf.io/container-device-interface v0.8.1
tags.cncf.io/container-device-interface/specs-go v0.8.0
)
......@@ -14,12 +18,13 @@ require (
require (
github.com/cpuguy83/go-md2man/v2 v2.0.6 // indirect
github.com/fsnotify/fsnotify v1.8.0 // indirect
github.com/kr/text v0.2.0 // indirect
github.com/opencontainers/runtime-tools v0.9.1-0.20221107090550-2e043c6bd626 // indirect
github.com/rogpeppe/go-internal v1.14.1 // indirect
github.com/russross/blackfriday/v2 v2.1.0 // indirect
github.com/syndtr/gocapability v0.0.0-20200815063812-42c35b437635 // indirect
github.com/xrash/smetrics v0.0.0-20240521201337-686a1a2994c1 // indirect
golang.org/x/mod v0.23.0 // indirect
golang.org/x/sys v0.30.0 // indirect
gopkg.in/ini.v1 v1.67.0 // indirect
golang.org/x/sys v0.37.0 // indirect
sigs.k8s.io/yaml v1.4.0 // indirect
)
......@@ -2,11 +2,14 @@ github.com/blang/semver/v4 v4.0.0 h1:1PFHFE6yCCTv8C1TeyNNarDzntLi7wMI5i/pzqYIsAM
github.com/blang/semver/v4 v4.0.0/go.mod h1:IbckMUScFkM3pff0VJDNKRiT6TG/YpiHIM2yvyW5YoQ=
github.com/cpuguy83/go-md2man/v2 v2.0.6 h1:XJtiaUW6dEEqVuZiMTn1ldk455QWwEIsMIJlo5vtkx0=
github.com/cpuguy83/go-md2man/v2 v2.0.6/go.mod h1:oOW0eioCTA6cOiMLiUPZOpcVxMig6NIQQ7OS05n1F4g=
github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/fsnotify/fsnotify v1.8.0 h1:dAwr6QBTBZIkG8roQaJjGof0pp0EeF+tNV7YBP3F/8M=
github.com/fsnotify/fsnotify v1.8.0/go.mod h1:8jBTzvmWwFyi3Pb8djgCCO5IBqzKJ/Jwo8TRcHyHii0=
github.com/gofrs/flock v0.13.0 h1:95JolYOvGMqeH31+FC7D2+uULf6mG61mEZ/A8dRYMzw=
github.com/gofrs/flock v0.13.0/go.mod h1:jxeyy9R1auM5S6JYDBhDt+E2TCo7DkratH4Pgi8P+Z0=
github.com/google/go-cmp v0.5.9 h1:O2Tfq5qg4qc4AmwVlvv0oLiVAGB7enBSJ2x2DqQFi38=
github.com/google/go-cmp v0.5.9/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
github.com/google/uuid v1.3.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
......@@ -14,6 +17,10 @@ github.com/hashicorp/errwrap v1.0.0 h1:hLrqtEDnRye3+sgx6z4qVLNuviH3MR5aQ0ykNJa/U
github.com/hashicorp/errwrap v1.0.0/go.mod h1:YH+1FKiLXxHSkmPseP+kNlulaMuP3n2brvKWEqk/Jc4=
github.com/hashicorp/go-multierror v1.1.1 h1:H5DkEtf6CXdFp0N0Em5UCwQpXMWke8IA0+lD48awMYo=
github.com/hashicorp/go-multierror v1.1.1/go.mod h1:iw975J/qwKPdAO1clOe2L8331t/9/fmwbPZ6JB6eMoM=
github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE=
github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk=
github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=
github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE=
github.com/mndrix/tap-go v0.0.0-20171203230836-629fa407e90b/go.mod h1:pzzDgJWZ34fGzaAZGFW22KVZDfyrYW+QABMrWnJBnSs=
github.com/mrunalp/fileutils v0.5.0/go.mod h1:M1WthSahJixYnrXQl/DFQuteStB1weuxD2QJNHXfbSQ=
github.com/opencontainers/runtime-spec v1.0.3-0.20220825212826-86290f6a00fb/go.mod h1:jwyrGlmzljRJv/Fgzds9SsS/C5hL+LL3ko9hs6T5lQ0=
......@@ -27,6 +34,8 @@ github.com/pelletier/go-toml v1.9.5 h1:4yBQzkHv+7BHq2PQUZF3Mx0IYxG7LsP222s7Agd3v
github.com/pelletier/go-toml v1.9.5/go.mod h1:u1nR/EPcESfeI/szUZKdtJ0xRNbUoANCkoOuaOx1Y+c=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/rogpeppe/go-internal v1.14.1 h1:UQB4HGPB6osV0SQTLymcB4TgvyWu6ZyliaW0tI/otEQ=
github.com/rogpeppe/go-internal v1.14.1/go.mod h1:MaRKkUm5W0goXpeCfT7UZI6fk/L7L7so1lCWt35ZSgc=
github.com/russross/blackfriday/v2 v2.1.0 h1:JIOH55/0cWyOuilr9/qlrm0BSXldqnqwMsf35Ld67mk=
github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM=
github.com/sirupsen/logrus v1.8.1/go.mod h1:yWOB1SBYBC5VeMP7gHvWumXLIWorT60ONWic61uBYv0=
......@@ -35,8 +44,9 @@ github.com/sirupsen/logrus v1.9.3/go.mod h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVs
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs=
github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
github.com/stretchr/testify v1.7.0 h1:nwc3DEeHmmLAfoZucVR881uASk0Mfjw8xYJ99tb5CcY=
github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U=
github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U=
github.com/syndtr/gocapability v0.0.0-20200815063812-42c35b437635 h1:kdXcSzyDtseVEc4yCz2qF8ZrQvIDBJLl4S1c3GCXmoI=
github.com/syndtr/gocapability v0.0.0-20200815063812-42c35b437635/go.mod h1:hkRG7XYTFWNJGYcbNJQlaLq0fg1yr4J4t/NcTQtrfww=
github.com/urfave/cli v1.19.1/go.mod h1:70zkFmudgCuE/ngEzBv17Jvp/497gISqfk5gWijbERA=
......@@ -55,10 +65,11 @@ golang.org/x/mod v0.23.0/go.mod h1:6SkKJ3Xj0I0BrPOZoBy3bdMptDDU9oJrpohJ3eWZ1fY=
golang.org/x/sys v0.0.0-20191026070338-33540a1f6037/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20191115151921-52ab43148777/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.30.0 h1:QjkSwP/36a20jFYWkSue1YwXzLmsV5Gfq7Eiy72C1uc=
golang.org/x/sys v0.30.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM=
golang.org/x/sys v0.37.0 h1:fdNQudmxPjkdUTPnLn5mdQv7Zwvbvpaxqs831goi9kQ=
golang.org/x/sys v0.37.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk=
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q=
gopkg.in/ini.v1 v1.67.0 h1:Dgnx+6+nfE+IfzjUEISNeydPJh9AXNNsWbGP9KzCsOA=
gopkg.in/ini.v1 v1.67.0/go.mod h1:pNLf8WUiyNEtQjuu5G5vTm06TEv9tsIgeAvK8hOrP4k=
gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ=
......
......@@ -17,8 +17,9 @@ import (
// a map of environment variable to values that can be used to perform lookups
// such as requirements.
type DTK struct {
env map[string]string
mounts []specs.Mount
env map[string]string
mounts []specs.Mount
ContainerId string
}
// NewDTKImageFromSpec creates a DTK image from the input OCI runtime spec.
......
package dcuTracker
import (
"dtk-container-toolkit/internal/hydcu"
"encoding/json"
"fmt"
"os"
"os/signal"
"reflect"
"sort"
"strconv"
"strings"
"syscall"
"time"
"github.com/gofrs/flock"
)
type accessibility int
const (
SHARED_ACCESS accessibility = iota
EXCLUSIVE_ACCESS
)
// Interface for DCU Tracker package
type Interface interface {
// Initialize DCU Tracker
Init() error
// Enable DCU Tracker
Enable() error
// Disable DCU Tracker
Disable() error
// Reset DCU Tracker
Reset() error
// Show DCUs status
ShowStatus() error
// Make specified DCUs exclusive such that they can be used
// by at most one container at any instance
MakeDCUsExclusive(dcus string) error
// Make specified DCUs shared such that they can be used
// by any number of containers at any instance
MakeDCUsShared(dcus string) error
// Reserve DCUs for a container
ReserveDCUs(dcus string, containerId string) ([]int, error)
// Release all DCUs linked to a container
ReleaseDCUs(containerId string) error
}
type dcu_status_t struct {
// UUID of DCU
UUID string `json:"uuid"`
// Partition Type of the DCU
PartitionType string `json:"partitionType"`
// DCU accessibility
Accessibility accessibility `json:"accessibility"`
// Container Ids of the containers to which the DCU is assigned
ContainerIds []string `json:"containerIds"`
}
type dcu_tracker_data_t struct {
//Status of DCU Tracker
Enabled bool `json:"enabled"`
//Status of all DCUs
DCUsStatus map[int]dcu_status_t `json:"dcusStatus"`
// Info of all DCUs
DCUsInfo map[int]hydcu.DeviceInfo `json:"dcusInfo"`
}
// isDCUTrackerInitializedType is the type for functions
// that return if DCU Tracker is initialized
type isDCUTrackerInitializedType func() (bool, error)
// initializeDCUTrackerType is the type for functions that
// initialize DCU Tracker
type initializeDCUTrackerType func() error
// parseGPUsListType is the type for functions that parse
// DCU list strings and returns the valid and invalid DCU Ids
type parseDCUsListType func(string) ([]int, []string, []string, error)
// readDCUTrackerFileType is the type for functions that
// read the DCU Tracker file and return the DCUs status
type readDCUTrackerFileType func() (dcu_tracker_data_t, error)
// writeDCUTrackerFileType is the type for functions that
// write the DCUs status to DCU Tracker file
type writeDCUTrackerFileType func(dcu_tracker_data_t) error
// validateDCUsInfoType is the type for functions that
// validate the DCUs info
type validateDCUsInfoType func(map[int]hydcu.DeviceInfo) (bool, error)
type dcu_tracker_t struct {
// path to DCU Tracker lock file
dcuTrackerLockFile string
// function to check if DCU Tracker is initialized
isDCUTrackerInitialized isDCUTrackerInitializedType
// function to initialize DCU Tracker
initializeDCUTracker initializeDCUTrackerType
// function to parse DCU list strings
parseDCUsList parseDCUsListType
// function to read DCU Tracker file
readDCUTrackerFile readDCUTrackerFileType
// function to write DCU Tracker file
writeDCUTrackerFile writeDCUTrackerFileType
// function to validate DCUs info
validateDCUsInfo validateDCUsInfoType
}
const (
dcuTrackerFile = "/var/log/dcu-tracker.json"
dcuTrackerLockFile = "/var/log/dcu-tracker.lock"
)
func setupSignalHandler(lock *flock.Flock) {
c := make(chan os.Signal, 1)
signal.Notify(c, syscall.SIGINT, syscall.SIGTERM)
go func() {
sig := <-c
fmt.Printf("Received signal: %v. Cleaning up...\n", sig)
if lock != nil {
_ = lock.Unlock()
}
os.Exit(1)
}()
}
func acquireLock(lockFile string) (*flock.Flock, error) {
lock := flock.New(lockFile)
timeout := time.After(10 * time.Second)
tick := time.Tick(100 * time.Millisecond)
for {
select {
case <-timeout:
return nil, fmt.Errorf("Acquiring lock timeout exceeded")
case <-tick:
locker, err := lock.TryLock()
if err != nil {
return nil, fmt.Errorf("Failed to acquire lock, Error: %v", err)
}
if locker {
return lock, nil
}
}
}
}
func parseDCUsList(dcus string) ([]int, []string, []string, error) {
// isHexString checks if a string contains only hexadecimal characters
isHexString := func(s string) bool {
if len(s) == 0 {
return false
}
for _, c := range s {
if !((c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F')) {
return false
}
}
return true
}
validDCUs := []int{}
invalidDCUs := []string{}
invalidDCUsRange := []string{}
dcusInfo, err := hydcu.GetHYDCUs()
if err != nil {
return []int{}, []string{}, []string{}, fmt.Errorf("Failed to get DCU info, Error: %v", err)
}
if dcus == "all" || dcus == "All" || dcus == "ALL" {
for i := 0; i < len(dcusInfo); i++ {
validDCUs = append(validDCUs, i)
}
return validDCUs, []string{}, []string{}, nil
}
uuidToDCUIdMap, err := hydcu.GetUniqueIdToDeviceIndexMap()
if err != nil {
fmt.Printf("Failed to get UUID to DCU Id mappings: %v", err)
uuidToDCUIdMap = make(map[string][]int)
}
for _, c := range strings.Split(dcus, ",") {
if strings.HasPrefix(c, "0x") || strings.HasPrefix(c, "0X") ||
(len(c) > 8 && isHexString(c)) {
uuid := strings.ToLower(c)
if !strings.HasPrefix(uuid, "0x") {
uuid = "0x" + uuid
}
if gpuIds, exists := uuidToDCUIdMap[uuid]; exists {
validDCUs = append(validDCUs, gpuIds...)
} else {
uuid = strings.TrimPrefix(uuid, "0x")
if dcuIds, exists := uuidToDCUIdMap[uuid]; exists {
validDCUs = append(validDCUs, dcuIds...)
} else {
invalidDCUs = append(invalidDCUs, c)
}
}
} else if strings.Contains(c, "-") {
devsRange := strings.SplitN(c, "-", 2)
start, err0 := strconv.Atoi(devsRange[0])
end, err1 := strconv.Atoi(devsRange[1])
if err0 != nil || err1 != nil ||
start < 0 || end < 0 || start > end {
invalidDCUsRange = append(invalidDCUsRange, c)
} else {
for i := start; i <= end; i++ {
if i < len(dcusInfo) {
validDCUs = append(validDCUs, i)
} else {
invalidDCUs = append(invalidDCUs, strconv.Itoa(i))
}
}
}
} else {
i, err := strconv.Atoi(c)
if err == nil {
if i >= 0 && i < len(dcusInfo) {
validDCUs = append(validDCUs, i)
} else {
invalidDCUs = append(invalidDCUs, c)
}
} else {
invalidDCUs = append(invalidDCUs, c)
}
}
}
sort.Ints(validDCUs)
return validDCUs, invalidDCUs, invalidDCUsRange, nil
}
func isDCUTrackerInitialized() (bool, error) {
dcuTrackerInitialized := false
_, err := os.Stat(dcuTrackerFile)
if err == nil {
dcuTrackerInitialized = true
} else {
if !os.IsNotExist(err) {
return false, fmt.Errorf("Error checking file %v, Error:%v", dcuTrackerFile, err)
}
}
return dcuTrackerInitialized, nil
}
func readDCUTrackerFile() (dcu_tracker_data_t, error) {
file, err := os.Open(dcuTrackerFile)
if err != nil {
return dcu_tracker_data_t{DCUsStatus: make(map[int]dcu_status_t), DCUsInfo: make(map[int]hydcu.DeviceInfo)},
fmt.Errorf("Error opening file, Error: %v", err)
}
defer file.Close()
var dcuTrackerData dcu_tracker_data_t
decoder := json.NewDecoder(file)
if err := decoder.Decode(&dcuTrackerData); err != nil {
return dcu_tracker_data_t{DCUsStatus: make(map[int]dcu_status_t), DCUsInfo: make(map[int]hydcu.DeviceInfo)},
fmt.Errorf("Failed to decode JSON, Error: %v", err)
}
return dcuTrackerData, nil
}
func writeDCUTrackerFile(dcuTrackerData dcu_tracker_data_t) error {
tempPath := dcuTrackerFile + ".tmp"
tempFile, err := os.Create(tempPath)
if err != nil {
return fmt.Errorf("Error creating temp file, Error: %v", err)
}
encoder := json.NewEncoder(tempFile)
if err := encoder.Encode(dcuTrackerData); err != nil {
tempFile.Close()
os.Remove(tempPath)
return fmt.Errorf("Error encoding JSON to temp file, Error: %v", err)
}
if err := tempFile.Sync(); err != nil {
tempFile.Close()
os.Remove(tempPath)
return fmt.Errorf("Error syncing temp file: %v", err)
}
tempFile.Close()
if err := os.Rename(tempPath, dcuTrackerFile); err != nil {
return fmt.Errorf("Error renaming temp file: %v", err)
}
return nil
}
func initializeDCUTracker() error {
dcusInfo, err := hydcu.GetHYDCUs()
if err != nil {
return fmt.Errorf("Failed to get HY DCUs info, Error: %v", err)
}
uuidToDCUIdMap, err := hydcu.GetUniqueIdToDeviceIndexMap()
if err != nil {
uuidToDCUIdMap = make(map[string][]int) // Continue with empty map
}
dcuIdToUUIDMap := make(map[int]string)
for uuid, dcuIds := range uuidToDCUIdMap {
if strings.HasPrefix(uuid, "0x") || strings.HasPrefix(uuid, "0X") {
uuid = uuid[2:]
}
uuid = "0x" + strings.ToUpper(uuid)
for _, dcuId := range dcuIds {
dcuIdToUUIDMap[dcuId] = uuid
}
}
dcuTrackerData := dcu_tracker_data_t{Enabled: false, DCUsStatus: make(map[int]dcu_status_t), DCUsInfo: make(map[int]hydcu.DeviceInfo)}
for dcuId, dcuInfo := range dcusInfo {
dcuTrackerData.DCUsInfo[dcuId] = dcuInfo
dcuTrackerData.DCUsStatus[dcuId] = dcu_status_t{
UUID: dcuIdToUUIDMap[dcuId],
PartitionType: dcusInfo[dcuId].PartitionType,
Accessibility: SHARED_ACCESS,
ContainerIds: []string{},
}
}
return writeDCUTrackerFile(dcuTrackerData)
}
func validateDCUsInfo(savedDCUsInfo map[int]hydcu.DeviceInfo) (bool, error) {
tempDCUsInfo, err := hydcu.GetHYDCUs()
if err != nil {
return false, fmt.Errorf("Failed to get HY DCUs info, Error: %v", err)
}
currentDCUsInfo := make(map[int]hydcu.DeviceInfo)
for dcuId, dcuInfo := range tempDCUsInfo {
currentDCUsInfo[dcuId] = dcuInfo
}
equal := reflect.DeepEqual(savedDCUsInfo, currentDCUsInfo)
if equal != true {
fmt.Printf("DCUs info is invalid. Please reset DCU Tracker.\n")
return false, nil
}
return true, nil
}
func (dcuTracker *dcu_tracker_t) Init() (err error) {
lock, err := acquireLock(dcuTracker.dcuTrackerLockFile)
if err != nil {
return fmt.Errorf("Init lock failed: %v", err)
}
defer func() {
if lock != nil {
_ = lock.Unlock()
}
}()
setupSignalHandler(lock)
defer func() {
if r := recover(); r != nil {
err = fmt.Errorf("Panic in Init: %v", r)
}
}()
err = dcuTracker.initializeDCUTracker()
if err != nil {
return fmt.Errorf("Failed to initialize GPU Tracker, Error: %v", err)
}
return nil
}
func (dcuTracker *dcu_tracker_t) Enable() (err error) {
lock, err := acquireLock(dcuTracker.dcuTrackerLockFile)
if err != nil {
return fmt.Errorf("Enable lock failed: %v", err)
}
defer func() {
if lock != nil {
_ = lock.Unlock()
}
}()
setupSignalHandler(lock)
defer func() {
if r := recover(); r != nil {
err = fmt.Errorf("Panic in Enable: %v", r)
}
}()
dcuTrackerInitialized, err := dcuTracker.isDCUTrackerInitialized()
if err != nil {
return fmt.Errorf("Failed to check if DCU Tracker is initialized, Error: %v\n", err)
}
if !dcuTrackerInitialized {
err := dcuTracker.initializeDCUTracker()
if err != nil {
return err
}
}
dcusTrackerData, err := dcuTracker.readDCUTrackerFile()
if err != nil {
fmt.Printf("Failed to show DCU Tracker status, Error: %v\n", err)
return err
}
if dcusTrackerData.Enabled {
fmt.Printf("DCU Tracker is already enabled\n")
return nil
}
err = dcuTracker.initializeDCUTracker()
if err != nil {
fmt.Printf("Failed to enable DCU Tracker, Error: %v\n", err)
return err
}
dcusTrackerData, err = dcuTracker.readDCUTrackerFile()
if err != nil {
fmt.Printf("Failed to enable DCU Tracker, Error: %v\n", err)
return err
}
dcusTrackerData.Enabled = true
err = dcuTracker.writeDCUTrackerFile(dcusTrackerData)
if err != nil {
fmt.Printf("Failed to enable DCU Tracker, Error: %v\n", err)
return err
}
fmt.Printf("DCU Tracker has been enabled\n")
return nil
}
func (dcuTracker *dcu_tracker_t) Disable() (err error) {
lock, err := acquireLock(dcuTracker.dcuTrackerLockFile)
if err != nil {
return fmt.Errorf("Disable lock failed: %v", err)
}
defer func() {
if lock != nil {
_ = lock.Unlock()
}
}()
setupSignalHandler(lock)
defer func() {
if r := recover(); r != nil {
err = fmt.Errorf("Panic in Disable: %v", r)
}
}()
dcuTrackerInitialized, err := dcuTracker.isDCUTrackerInitialized()
if err != nil {
fmt.Printf("Failed to check if DCU Tracker is initialized, Error:%v\n", err)
return err
}
if !dcuTrackerInitialized {
err := dcuTracker.initializeDCUTracker()
if err != nil {
fmt.Printf("Failed to disable DCU Tracker, Error: %v\n", err)
return err
}
} else {
dcusTrackerData, err := dcuTracker.readDCUTrackerFile()
if err != nil {
fmt.Printf("Failed to disable DCU Tracker, Error: %v\n", err)
return err
}
dcusTrackerData.Enabled = false
err = dcuTracker.writeDCUTrackerFile(dcusTrackerData)
if err != nil {
fmt.Printf("Failed to disable DCU Tracker, Error: %v\n", err)
return err
}
}
fmt.Printf("DCU Tracker has been disabled\n")
return nil
}
func (dcuTracker *dcu_tracker_t) Reset() (err error) {
lock, err := acquireLock(dcuTracker.dcuTrackerLockFile)
if err != nil {
return fmt.Errorf("Reset lock failed: %v", err)
}
defer func() {
if lock != nil {
_ = lock.Unlock()
}
}()
setupSignalHandler(lock)
defer func() {
if r := recover(); r != nil {
err = fmt.Errorf("Panic in Reset: %v", r)
}
}()
dcuTrackerInitialized, err := dcuTracker.isDCUTrackerInitialized()
if err != nil {
fmt.Printf("Failed to check if DCU Tracker is initialized, Error:%v\n", err)
return err
}
dcuTrackerEnabled := false
if !dcuTrackerInitialized {
err := dcuTracker.initializeDCUTracker()
if err != nil {
fmt.Printf("Failed to reset DCU Tracker, Error: %v\n", err)
return err
}
} else {
dcusTrackerData, err := dcuTracker.readDCUTrackerFile()
if err != nil {
fmt.Printf("Failed to reset DCU Tracker, Error: %v\n", err)
return err
}
dcuTrackerEnabled = dcusTrackerData.Enabled
err = dcuTracker.initializeDCUTracker()
if err != nil {
fmt.Printf("Failed to reset DCU Tracker, Error: %v\n", err)
return err
}
dcusTrackerData, err = dcuTracker.readDCUTrackerFile()
if err != nil {
fmt.Printf("Failed to reset DCU Tracker, Error: %v\n", err)
return err
}
if dcuTrackerEnabled == true {
dcusTrackerData.Enabled = true
err = dcuTracker.writeDCUTrackerFile(dcusTrackerData)
if err != nil {
fmt.Printf("Failed to reset DCU Tracker, Error: %v\n", err)
return err
}
}
}
fmt.Printf("DCU Tracker has been reset\n")
if dcuTrackerEnabled {
fmt.Printf("Since DCU Tracker was enabled, it is recommended to stop and restart running containers to get the most accurate GPU Tracker status\n")
}
return nil
}
func (dcuTracker *dcu_tracker_t) ShowStatus() (err error) {
lock, err := acquireLock(dcuTracker.dcuTrackerLockFile)
if err != nil {
return fmt.Errorf("ShowStatus lock failed: %v", err)
}
defer func() {
if lock != nil {
_ = lock.Unlock()
}
}()
setupSignalHandler(lock)
defer func() {
if r := recover(); r != nil {
err = fmt.Errorf("Panic in ShowStatus: %v", r)
}
}()
dcuTrackerInitialized, err := dcuTracker.isDCUTrackerInitialized()
if err != nil {
fmt.Printf("Failed to check if DCU Tracker is initialized, Error:%v\n", err)
return err
}
if !dcuTrackerInitialized {
err := dcuTracker.initializeDCUTracker()
if err != nil {
return err
}
}
dcusTrackerData, err := dcuTracker.readDCUTrackerFile()
if err != nil {
fmt.Printf("Failed to show DCU Tracker status, Error: %v\n", err)
return err
}
if dcusTrackerData.Enabled == false {
fmt.Printf("DCU Tracker is disabled\n")
return nil
}
result, err := dcuTracker.validateDCUsInfo(dcusTrackerData.DCUsInfo)
if err != nil || result != true {
return err
}
fmt.Println(strings.Repeat("-", 120))
fmt.Printf("%-10s%-25s%-20s%-65s\n", "GPU Id", "UUID", "Accessibility", "Container Ids")
fmt.Println(strings.Repeat("-", 120))
for dcuId := 0; dcuId < len(dcusTrackerData.DCUsStatus); dcuId++ {
var accessibility string
switch dcusTrackerData.DCUsStatus[dcuId].Accessibility {
case SHARED_ACCESS:
accessibility = "Shared"
case EXCLUSIVE_ACCESS:
accessibility = "Exclusive"
default:
fmt.Printf("Invalid accessibility value %v\n", dcusTrackerData.DCUsStatus[dcuId].Accessibility)
break
}
if len(dcusTrackerData.DCUsStatus[dcuId].ContainerIds) > 0 {
for idx, id := range dcusTrackerData.DCUsStatus[dcuId].ContainerIds {
if idx == 0 {
fmt.Printf("%-10v%-25s%-20v%-65v\n", dcuId, dcusTrackerData.DCUsStatus[dcuId].UUID, accessibility, id)
} else {
fmt.Printf("%-10v%-25v%-20v%-65v\n", "", "", "", id)
}
}
} else {
fmt.Printf("%-10v%-25v%-20v%-65v\n", dcuId, dcusTrackerData.DCUsStatus[dcuId].UUID, accessibility, "None")
}
}
return nil
}
func (dcuTracker *dcu_tracker_t) MakeDCUsExclusive(dcus string) (err error) {
lock, err := acquireLock(dcuTracker.dcuTrackerLockFile)
if err != nil {
return fmt.Errorf("MakeDCUsExclusive lock failed: %v", err)
}
defer func() {
if lock != nil {
_ = lock.Unlock()
}
}()
setupSignalHandler(lock)
defer func() {
if r := recover(); r != nil {
err = fmt.Errorf("Panic in MakeDCUsExclusive: %v", r)
}
}()
dcuTrackerInitialized, err := dcuTracker.isDCUTrackerInitialized()
if err != nil {
fmt.Printf("Failed to check if DCU Tracker is initialized, Error:%v\n", err)
return err
}
if !dcuTrackerInitialized {
err := dcuTracker.initializeDCUTracker()
if err != nil {
return err
}
}
dcusTrackerData, err := dcuTracker.readDCUTrackerFile()
if err != nil {
fmt.Printf("Failed to make DCUs exclusive, Error: %v\n", err)
return err
}
if dcusTrackerData.Enabled == false {
fmt.Printf("DCU Tracker is disabled\n")
return nil
}
result, err := dcuTracker.validateDCUsInfo(dcusTrackerData.DCUsInfo)
if err != nil || result != true {
return err
}
validDCUs, invalidDCUs, invalidDCUsRange, err := dcuTracker.parseDCUsList(dcus)
if err != nil {
fmt.Printf("Failed to parse DCUs list, Error: %v\n", err)
return err
}
dcusMadeExclusive := []int{}
dcusNotMadeExclusive := []int{}
for _, dcuId := range validDCUs {
if len(dcusTrackerData.DCUsStatus[dcuId].ContainerIds) < 2 {
dcusTrackerData.DCUsStatus[dcuId] = dcu_status_t{
UUID: dcusTrackerData.DCUsStatus[dcuId].UUID,
PartitionType: dcusTrackerData.DCUsStatus[dcuId].PartitionType,
Accessibility: EXCLUSIVE_ACCESS,
ContainerIds: dcusTrackerData.DCUsStatus[dcuId].ContainerIds,
}
dcusMadeExclusive = append(dcusMadeExclusive, dcuId)
} else {
dcusNotMadeExclusive = append(dcusNotMadeExclusive, dcuId)
}
}
err = dcuTracker.writeDCUTrackerFile(dcusTrackerData)
if err != nil {
fmt.Printf("Failed to make DCUs exclusive, Error: %v\n", err)
return err
}
if len(dcusMadeExclusive) > 0 {
fmt.Printf("DCUs %v have been made exclusive\n", dcusMadeExclusive)
}
if len(dcusNotMadeExclusive) > 0 {
fmt.Printf("DCUs %v have not been made exclusive because more than one container is currently using it\n", dcusNotMadeExclusive)
}
if len(invalidDCUsRange) > 0 {
fmt.Printf("Ignoring %v DCUs Ranges as they are invalid\n", invalidDCUsRange)
}
if len(invalidDCUs) > 0 {
fmt.Printf("Ignoring %v DCUs as they are invalid\n", invalidDCUs)
}
return nil
}
func (dcuTracker *dcu_tracker_t) MakeDCUsShared(dcus string) (err error) {
lock, err := acquireLock(dcuTracker.dcuTrackerLockFile)
if err != nil {
return fmt.Errorf("MakeDCUsShared lock failed: %v", err)
}
defer func() {
if lock != nil {
_ = lock.Unlock()
}
}()
setupSignalHandler(lock)
defer func() {
if r := recover(); r != nil {
err = fmt.Errorf("Panic in MakeDCUsShared: %v", r)
}
}()
dcuTrackerInitialized, err := dcuTracker.isDCUTrackerInitialized()
if err != nil {
fmt.Printf("Failed to check if DCU Tracker is initialized, Error:%v\n", err)
return err
}
if !dcuTrackerInitialized {
err = dcuTracker.initializeDCUTracker()
if err != nil {
return err
}
}
dcusTrackerData, err := dcuTracker.readDCUTrackerFile()
if err != nil {
fmt.Printf("Failed to make DCUs %v shared, Error: %v\n", dcus, err)
return err
}
if dcusTrackerData.Enabled == false {
fmt.Printf("DCU Tracker is disabled\n")
return nil
}
result, err := dcuTracker.validateDCUsInfo(dcusTrackerData.DCUsInfo)
if err != nil || result != true {
return err
}
validDCUs, invalidDCUs, invalidDCUsRange, err := dcuTracker.parseDCUsList(dcus)
if err != nil {
fmt.Printf("Failed to parse DCUs list %v, Error: %v\n", dcus, err)
return err
}
for _, dcuId := range validDCUs {
dcusTrackerData.DCUsStatus[dcuId] = dcu_status_t{
UUID: dcusTrackerData.DCUsStatus[dcuId].UUID,
PartitionType: dcusTrackerData.DCUsStatus[dcuId].PartitionType,
Accessibility: SHARED_ACCESS,
ContainerIds: dcusTrackerData.DCUsStatus[dcuId].ContainerIds,
}
}
err = dcuTracker.writeDCUTrackerFile(dcusTrackerData)
if err != nil {
fmt.Printf("Failed to make DCUs shared, Error: %v\n", err)
return err
}
if len(validDCUs) > 0 {
fmt.Printf("DCUs %v have been made shared\n", validDCUs)
}
if len(invalidDCUsRange) > 0 {
fmt.Printf("Ignoring %v DCUs Ranges as they are invalid\n", invalidDCUsRange)
}
if len(invalidDCUs) > 0 {
fmt.Printf("Ignoring %v DCUs as they are invalid\n", invalidDCUs)
}
return nil
}
func (dcuTracker *dcu_tracker_t) ReserveDCUs(dcus string, containerId string) (allocatedDCUs []int, err error) {
lock, err := acquireLock(dcuTracker.dcuTrackerLockFile)
if err != nil {
return nil, fmt.Errorf("ReserveDCUs lock failed: %v", err)
}
defer func() {
if lock != nil {
_ = lock.Unlock()
}
}()
setupSignalHandler(lock)
defer func() {
if r := recover(); r != nil {
err = fmt.Errorf("Panic in ReserveDCUs: %v", r)
allocatedDCUs = []int{}
}
}()
dcuTrackerInitialized, err := dcuTracker.isDCUTrackerInitialized()
if err != nil {
fmt.Printf("Failed to check if DCU Tracker is initialized, Error:%v\n", err)
return []int{}, err
}
if !dcuTrackerInitialized {
err = dcuTracker.initializeDCUTracker()
if err != nil {
return []int{}, err
}
}
dcusTrackerData, err := dcuTracker.readDCUTrackerFile()
if err != nil {
fmt.Printf("Failed to reserve DCUs %v, Error:%v\n", dcus, err)
return []int{}, err
}
validDCUs, invalidDCUs, invalidDCUsRange, err := dcuTracker.parseDCUsList(dcus)
if err != nil {
fmt.Printf("Failed to parse DCUs list %v, Error: %v\n", dcus, err)
return []int{}, err
}
if len(invalidDCUsRange) > 0 {
fmt.Printf("Ignoring %v DCUs Ranges as they are invalid\n", invalidDCUsRange)
}
if len(invalidDCUs) > 0 {
fmt.Printf("Ignoring %v DCUs as they are invalid\n", invalidDCUs)
}
if dcusTrackerData.Enabled == false {
return validDCUs, nil
}
result, err := dcuTracker.validateDCUsInfo(dcusTrackerData.DCUsInfo)
if err != nil || result != true {
return []int{}, fmt.Errorf("DCUs info is invalid, Please reset DCU Tracker.\n")
}
var unavailableDCUs []int
for _, dcuId := range validDCUs {
if dcusTrackerData.DCUsStatus[dcuId].Accessibility == SHARED_ACCESS ||
(dcusTrackerData.DCUsStatus[dcuId].Accessibility == EXCLUSIVE_ACCESS &&
len(dcusTrackerData.DCUsStatus[dcuId].ContainerIds) == 0) {
dcusTrackerData.DCUsStatus[dcuId] = dcu_status_t{
UUID: dcusTrackerData.DCUsStatus[dcuId].UUID,
PartitionType: dcusTrackerData.DCUsStatus[dcuId].PartitionType,
Accessibility: dcusTrackerData.DCUsStatus[dcuId].Accessibility,
ContainerIds: append(dcusTrackerData.DCUsStatus[dcuId].ContainerIds, containerId),
}
allocatedDCUs = append(allocatedDCUs, dcuId)
} else {
unavailableDCUs = append(unavailableDCUs, dcuId)
}
}
err = dcuTracker.writeDCUTrackerFile(dcusTrackerData)
if err != nil {
fmt.Printf("Failed to reserve DCUs %v, Error:%v\n", validDCUs, err)
return []int{}, err
}
if len(allocatedDCUs) > 0 {
fmt.Printf("DCUs %v allocated\n", allocatedDCUs)
}
if len(unavailableDCUs) > 0 {
fmt.Printf("DCUs %v are exclusive and already in use\n", unavailableDCUs)
return []int{}, fmt.Errorf("DCUs %v are exclusive and already in use\n", unavailableDCUs)
}
return allocatedDCUs, nil
}
func (dcuTracker *dcu_tracker_t) ReleaseDCUs(containerId string) (err error) {
removeContainerId := func(containerId string, containerIds []string) ([]string, bool) {
for idx, id := range containerIds {
if id == containerId {
return append(containerIds[:idx], containerIds[idx+1:]...), true
}
}
return containerIds, false
}
lock, err := acquireLock(dcuTracker.dcuTrackerLockFile)
if err != nil {
return fmt.Errorf("ReleaseGPUs lock failed: %v", err)
}
defer func() {
if lock != nil {
_ = lock.Unlock()
}
}()
setupSignalHandler(lock)
defer func() {
if r := recover(); r != nil {
err = fmt.Errorf("Panic in ReleaseDCUs: %v", r)
}
}()
dcuTrackerInitialized, err := dcuTracker.isDCUTrackerInitialized()
if err != nil {
fmt.Printf("Failed to check if DCU Tracker is initialized, Error:%v\n", err)
return err
}
if dcuTrackerInitialized {
dcusTrackerData, err := dcuTracker.readDCUTrackerFile()
if err != nil {
fmt.Printf("Failed to release DCUs used by container %v, Error: %v\n", containerId, err)
return err
}
var releasedDCUs []int
for dcuId, _ := range dcusTrackerData.DCUsStatus {
containerIds, released := removeContainerId(containerId, dcusTrackerData.DCUsStatus[dcuId].ContainerIds)
if released {
dcusTrackerData.DCUsStatus[dcuId] = dcu_status_t{
UUID: dcusTrackerData.DCUsStatus[dcuId].UUID,
PartitionType: dcusTrackerData.DCUsStatus[dcuId].PartitionType,
Accessibility: dcusTrackerData.DCUsStatus[dcuId].Accessibility,
ContainerIds: containerIds,
}
releasedDCUs = append(releasedDCUs, dcuId)
}
}
err = dcuTracker.writeDCUTrackerFile(dcusTrackerData)
if err != nil {
fmt.Printf("Failed to release DCUs used by container %v, Error: %v\n", containerId, err)
return err
}
fmt.Printf("Released DCUs %v used by container %v\n", releasedDCUs, containerId)
}
return nil
}
func New() (Interface, error) {
dcuTracker := &dcu_tracker_t{
dcuTrackerLockFile: dcuTrackerLockFile,
isDCUTrackerInitialized: isDCUTrackerInitialized,
initializeDCUTracker: initializeDCUTracker,
parseDCUsList: parseDCUsList,
readDCUTrackerFile: readDCUTrackerFile,
writeDCUTrackerFile: writeDCUTrackerFile,
validateDCUsInfo: validateDCUsInfo,
}
return dcuTracker, nil
}
......@@ -5,6 +5,7 @@
package discover
import (
"dtk-container-toolkit/internal/config/image"
"dtk-container-toolkit/internal/info/drm"
"dtk-container-toolkit/internal/logger"
"dtk-container-toolkit/internal/lookup"
......@@ -81,7 +82,7 @@ func (s selectDeviceByPath) HookIsSelected(Hook) bool {
}
// NewCommonHCUDiscoverer creates a discoverer for the mounts required by HCU.
func NewCommonHCUDiscoverer(logger logger.Interface, dtkCDIHookPath string, driver *root.Driver, isMount bool) (Discover, error) {
func NewCommonHCUDiscoverer(logger logger.Interface, dtkCDIHookPath string, driver *root.Driver, isMount bool, containerImage image.DTK) (Discover, error) {
metaDevices := NewCharDeviceDiscoverer(
logger,
driver.Root,
......@@ -114,11 +115,18 @@ func NewCommonHCUDiscoverer(logger logger.Interface, dtkCDIHookPath string, driv
linkHook = CreateSymlinkHook(dtkCDIHookPath, []string{"/usr/local/hyhal::/opt/hyhal"})
}
var trackHook Hook
value := containerImage.Getenv(image.EnvVarDTKVisibleDevices)
if len(value) > 0 {
trackHook = CreateTrackHook(dtkCDIHookPath, containerImage.ContainerId)
}
d := Merge(
metaDevices,
libraries,
NewUserGroupDiscover(logger),
linkHook,
trackHook,
)
return d, nil
}
......@@ -49,6 +49,23 @@ func CreateSymlinkHook(dtkCDIHookPath string, links []string) Hook {
)
}
func CreateTrackHook(dtkCDIHookPath string, containerId string) Hook {
return CreateDtkTrackHook(
dtkCDIHookPath,
"dcu-tracker",
"release",
containerId,
)
}
func CreateDtkTrackHook(path string, s string, s2 string, id string) Hook {
return Hook{
Lifecycle: cdi.PoststopHook,
Path: path,
Args: []string{"dtk-ctk", s, s2, id},
}
}
// CreateDtkCDIHook creates a hook which invokes the DTK Container CLI hook subcommand.
func CreateDtkCDIHook(dtkCDIHookPath string, hookName string, additionalArgs ...string) Hook {
return cdiHook(dtkCDIHookPath).Create(hookName, additionalArgs...)
......
package hydcu
import (
"bufio"
"fmt"
"io/ioutil"
"math"
"os"
"os/exec"
"path/filepath"
"regexp"
"sort"
"strconv"
"strings"
)
type DeviceInfo struct {
DrmDevices []string
PartitionType string
}
type FileSystem interface {
Stat(name string) (os.FileInfo, error)
Glob(pattern string) ([]string, error)
ReadFile(name string) ([]byte, error)
GetDeviceStat(dev string, format string) (string, error)
}
type DefaultFS struct{}
var defaultFS FileSystem = &DefaultFS{}
func (fs *DefaultFS) Stat(name string) (os.FileInfo, error) { return os.Stat(name) }
func (fs *DefaultFS) Glob(pattern string) ([]string, error) { return filepath.Glob(pattern) }
func (fs *DefaultFS) ReadFile(name string) ([]byte, error) { return os.ReadFile(name) }
func (fs *DefaultFS) GetDeviceStat(dev string, format string) (string, error) {
out, err := exec.Command("stat", "-c", format, dev).Output()
if err != nil {
fmt.Println("stat failed for %v: Error %v", dev, err)
return "", err
}
return strings.TrimSpace(string(out)), nil
}
// GetHYDCUs return the list of all the DCU devices on the system.
func GetHYDCUs() ([]DeviceInfo, error) { return GetHyDCUsWithFS(defaultFS) }
func GetHyDCUsWithFS(fs FileSystem) ([]DeviceInfo, error) {
if _, err := fs.Stat("/sys/module/hydcu/drivers/"); err != nil {
return nil, err
}
renderDevIds := GetDevIdsFromTopology(fs)
// Map to store devices by unique_id to maintain grouping
uniqueIdDevices := make(map[string][]DeviceInfo)
var uniqueIds []string // To maintain order
pciDevs, err := fs.Glob("/sys/module/hydcu/drivers/pci:hydcu/[0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F]:*")
if err != nil {
fmt.Println("Failed to find hydcu driver directories: %v", err)
return nil, err
}
// Process platform devices for partitions
platformDevs, _ := fs.Glob("/sys/devices/platform/hydcu_xcp_*")
// Combine aboth PCI and platform devices
allDevs := append(pciDevs, platformDevs...)
//Process all devices using the same logic
for _, path := range allDevs {
computePartitionFile := filepath.Join(path, "current_compute_partition")
memoryPartitionFile := filepath.Join(path, "current_memory_partition")
computePartitionType, memoryPartitionType, combinedPartitionType := "", "", ""
// Read the compute partition
if data, err := ioutil.ReadFile(computePartitionFile); err == nil {
computePartitionType = strings.ToLower(strings.TrimSpace(string(data)))
}
// Read the memory partition
if data, err := ioutil.ReadFile(memoryPartitionFile); err == nil {
memoryPartitionType = strings.ToLower(strings.TrimSpace(string(data)))
}
combinedPartitionType = computePartitionType + "_" + memoryPartitionType
if combinedPartitionType == "_" {
combinedPartitionType = ""
}
drms, err := fs.Glob(path + "/drm/*")
if err != nil {
return nil, err
}
drmDevs := []string{}
renderMinor := 0
for _, drm := range drms {
dev := filepath.Base(drm)
if len(dev) >= 4 && dev[0:4] == "card" || len(dev) >= 7 && dev[0:7] == "renderD" {
drmDevs = append(drmDevs, "/dev/dri/"+dev)
if len(dev) >= 7 && dev[0:7] == "renderD" {
renderMinor, _ = strconv.Atoi(dev[7:])
}
}
}
if len(drmDevs) > 0 && renderMinor > 0 {
if devID, exists := renderDevIds[renderMinor]; exists {
if _, exists := uniqueIdDevices[devID]; !exists {
uniqueIds = append(uniqueIds, devID)
}
uniqueIdDevices[devID] = append(uniqueIdDevices[devID], DeviceInfo{DrmDevices: drmDevs, PartitionType: combinedPartitionType})
}
}
}
// Sort devices within each unique_id group by render minor number
for _, devID := range uniqueIds {
sort.Slice(uniqueIdDevices[devID], func(i, j int) bool {
getRenderID := func(devInfo DeviceInfo) int {
devs := devInfo.DrmDevices
for _, dev := range devs {
baseDev := filepath.Base(dev)
if len(baseDev) >= 7 && strings.HasPrefix(baseDev, "renderD") {
id, _ := strconv.Atoi(strings.TrimPrefix(baseDev, "renderD"))
return id
}
}
return 0
}
return getRenderID(uniqueIdDevices[devID][i]) < getRenderID(uniqueIdDevices[devID][j])
})
}
// Combine all devices maintaining the unique_id order
var devs []DeviceInfo
for _, devID := range uniqueIds {
devs = append(devs, uniqueIdDevices[devID]...)
}
return devs, nil
}
var topoUniqueIdRe = regexp.MustCompile(`unique_id\s(\d+)`)
var renderMinorRe = regexp.MustCompile(`drm_render_minor\s(\d+)`)
// GetDevIdsFromTopology returns a map of render minor numbers to unique_ids
func GetDevIdsFromTopology(fs FileSystem, topoRootParam ...string) map[int]string {
topoRoot := "/sys/class/kfd/kfd"
if len(topoRootParam) == 1 {
topoRoot = topoRootParam[0]
}
renderDevIds := make(map[int]string)
nodeFiles, err := fs.Glob(topoRoot + "/topology/nodes/*/properties")
if err != nil {
return renderDevIds
}
for _, nodeFile := range nodeFiles {
renderMinor, err := ParseTopologyProperties(fs, nodeFile, renderMinorRe)
if err != nil {
continue
}
if renderMinor <= 0 || renderMinor > math.MaxInt32 {
continue
}
devID, err := ParseTopologyPropertiesString(fs, nodeFile, topoUniqueIdRe)
if err != nil {
continue
}
renderDevIds[int(renderMinor)] = devID
}
return renderDevIds
}
// ParseTopologyProperties parses for a property value in kfd topology file as int64
// The format is usually one entry per line <name> <value>.
func ParseTopologyProperties(fs FileSystem, path string, re *regexp.Regexp) (int64, error) {
content, err := fs.ReadFile(path)
if err != nil {
return 0, err
}
scanner := bufio.NewScanner(strings.NewReader(string(content)))
for scanner.Scan() {
matches := re.FindStringSubmatch(scanner.Text())
if matches != nil {
return strconv.ParseInt(matches[1], 0, 64)
}
}
return 0, fmt.Errorf("property not found in %s", path)
}
// ParseTopologyPropertiesString parses for a property value in kfd topology file as string
// The format is usually one entry per line <name> <value>.
func ParseTopologyPropertiesString(fs FileSystem, path string, re *regexp.Regexp) (string, error) {
content, err := fs.ReadFile(path)
if err != nil {
return "", err
}
scanner := bufio.NewScanner(strings.NewReader(string(content)))
for scanner.Scan() {
matches := re.FindStringSubmatch(scanner.Text())
if matches != nil {
return matches[1], nil
}
}
return "", fmt.Errorf("property not found in %s", path)
}
// GetUniqueIdToDeviceIndexMap returns a map of unique_id (as hex string) to device indices
func GetUniqueIdToDeviceIndexMap() (map[string][]int, error) {
return GetUniqueIdToDeviceIndexMapWithFS(defaultFS)
}
// GetUniqueIdToDeviceIndexMapWithFS creates a mapping from unique_id (hex format) to device indices
func GetUniqueIdToDeviceIndexMapWithFS(fs FileSystem) (map[string][]int, error) {
devs, err := GetHyDCUsWithFS(fs)
if err != nil {
return nil, err
}
renderDevIds := GetDevIdsFromTopology(fs)
uniqueIdToIndex := make(map[string][]int)
// Process each device group and assign index
for deviceIndex, deviceGroup := range devs {
// Find the render minor for this device group
for _, device := range deviceGroup.DrmDevices {
// Extract render minor from device path like /dev/dri/renderD128
if strings.Contains(device, "renderD") {
renderMinorStr := strings.TrimPrefix(filepath.Base(device), "renderD")
if renderMinor, err := strconv.Atoi(renderMinorStr); err == nil {
if uniqueId, exists := renderDevIds[renderMinor]; exists {
// Convert decimal unique_id to hex format (without 0x prefix)
if uniqueIdInt, err := strconv.ParseUint(uniqueId, 10, 64); err == nil {
hexUniqueId := fmt.Sprintf("0x%x", uniqueIdInt)
uniqueIdToIndex[hexUniqueId] = append(uniqueIdToIndex[hexUniqueId], deviceIndex)
// Also support without 0x prefix
hexUniqueIdNoPrefix := fmt.Sprintf("%x", uniqueIdInt)
uniqueIdToIndex[hexUniqueIdNoPrefix] = append(uniqueIdToIndex[hexUniqueIdNoPrefix], deviceIndex)
}
}
}
break // Only need one render device per group
}
}
}
return uniqueIdToIndex, nil
}
......@@ -7,6 +7,7 @@ package modifier
import (
"dtk-container-toolkit/internal/config"
"dtk-container-toolkit/internal/config/image"
"dtk-container-toolkit/internal/dcu-tracker"
"dtk-container-toolkit/internal/discover"
"dtk-container-toolkit/internal/logger"
"dtk-container-toolkit/internal/lookup"
......@@ -22,12 +23,23 @@ import (
// The value of the DTK_DRIVER_CAPABILITIES environment variable is checked to determine if this modification should be made.
func NewGraphicsModifier(logger logger.Interface, cfg *config.Config, containerImage image.DTK, driver *root.Driver, isMount bool) (oci.SpecModifier, error) {
dtkCDIHookPath := cfg.DTKCTKConfig.Path
value := containerImage.Getenv(image.EnvVarDTKVisibleDevices)
if len(value) > 0 {
dcuTracker, err := dcuTracker.New()
if err == nil {
_, err = dcuTracker.ReserveDCUs(value, containerImage.ContainerId)
logger.Infof("ReserveDCUs %s", value)
if err != nil {
return nil, fmt.Errorf("failed to reserve DCUs: %v", err)
}
}
}
comDiscoverer, err := discover.NewCommonHCUDiscoverer(
logger,
dtkCDIHookPath,
driver,
isMount,
containerImage,
)
if err != nil {
return nil, fmt.Errorf("failed to create mounts discoverer: %v", err)
......
......@@ -36,7 +36,7 @@ func newDTKContainerRuntime(logger logger.Interface, cfg *config.Config, argv []
return nil, fmt.Errorf("error constructing OCI specification: %v", err)
}
specModifier, err := newSpecModifier(logger, cfg, ociSpec, driver)
specModifier, err := newSpecModifier(logger, cfg, ociSpec, driver, argv[len(argv) - 1])
if err != nil {
return nil, fmt.Errorf("failed to construct OCI spec modifier: %v", err)
}
......@@ -53,7 +53,7 @@ func newDTKContainerRuntime(logger logger.Interface, cfg *config.Config, argv []
}
// newSpecModifier is a factory method that creates constructs an OCI spec modifer based on the provided config.
func newSpecModifier(logger logger.Interface, cfg *config.Config, ociSpec oci.Spec, driver *root.Driver) (oci.SpecModifier, error) {
func newSpecModifier(logger logger.Interface, cfg *config.Config, ociSpec oci.Spec, driver *root.Driver, containerId string) (oci.SpecModifier, error) {
rawSpec, err := ociSpec.Load()
if err != nil {
return nil, fmt.Errorf("failed to load OCI spec: %v", err)
......@@ -63,7 +63,7 @@ func newSpecModifier(logger logger.Interface, cfg *config.Config, ociSpec oci.Sp
if err != nil {
return nil, err
}
image.ContainerId = containerId
mode := info.ResolveAutoMode(logger, cfg.DTKContainerRuntimeConfig.Mode, image)
// We update the mode here so that we can continue passing just the config to other functions.
cfg.DTKContainerRuntimeConfig.Mode = mode
......
......@@ -8,6 +8,7 @@ import (
"dtk-container-toolkit/internal/discover"
"dtk-container-toolkit/internal/edits"
"dtk-container-toolkit/internal/platform-support/dhcu"
"dtk-container-toolkit/internal/config/image"
"dtk-container-toolkit/pkg/c3000cdi/spec"
"dtk-container-toolkit/pkg/go-c3000lib/pkg/device"
"fmt"
......@@ -155,6 +156,7 @@ func (l *c3000smilib) newCommonC3000SmiDiscoverer() (discover.Discover, error) {
l.dtkCDIHookPath,
l.driver,
true,
image.DTK{},
)
return d, err
......
# Compiled Object files, Static and Dynamic libs (Shared Objects)
*.o
*.a
*.so
# Folders
_obj
_test
# Architecture specific extensions/prefixes
*.[568vq]
[568vq].out
*.cgo1.go
*.cgo2.c
_cgo_defun.c
_cgo_gotypes.go
_cgo_export.*
_testmain.go
*.exe
*.test
*.prof
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment