package dcuTracker import ( "dtk-container-toolkit/internal/hydcu" "encoding/json" "fmt" "os" "os/signal" "reflect" "sort" "strconv" "strings" "syscall" "time" "github.com/gofrs/flock" ) type accessibility int const ( SHARED_ACCESS accessibility = iota EXCLUSIVE_ACCESS ) // Interface for DCU Tracker package type Interface interface { // Initialize DCU Tracker Init() error // Enable DCU Tracker Enable() error // Disable DCU Tracker Disable() error // Reset DCU Tracker Reset() error // Show DCUs status ShowStatus() error // Make specified DCUs exclusive such that they can be used // by at most one container at any instance MakeDCUsExclusive(dcus string) error // Make specified DCUs shared such that they can be used // by any number of containers at any instance MakeDCUsShared(dcus string) error // Reserve DCUs for a container ReserveDCUs(dcus string, containerId string) ([]int, error) // Release all DCUs linked to a container ReleaseDCUs(containerId string) error } type dcu_status_t struct { // UUID of DCU UUID string `json:"uuid"` // Partition Type of the DCU PartitionType string `json:"partitionType"` // DCU accessibility Accessibility accessibility `json:"accessibility"` // Container Ids of the containers to which the DCU is assigned ContainerIds []string `json:"containerIds"` } type dcu_tracker_data_t struct { //Status of DCU Tracker Enabled bool `json:"enabled"` //Status of all DCUs DCUsStatus map[int]dcu_status_t `json:"dcusStatus"` // Info of all DCUs DCUsInfo map[int]hydcu.DeviceInfo `json:"dcusInfo"` } // isDCUTrackerInitializedType is the type for functions // that return if DCU Tracker is initialized type isDCUTrackerInitializedType func() (bool, error) // initializeDCUTrackerType is the type for functions that // initialize DCU Tracker type initializeDCUTrackerType func() error // parseGPUsListType is the type for functions that parse // DCU list strings and returns the valid and invalid DCU Ids type parseDCUsListType func(string) ([]int, []string, []string, error) // readDCUTrackerFileType is the type for functions that // read the DCU Tracker file and return the DCUs status type readDCUTrackerFileType func() (dcu_tracker_data_t, error) // writeDCUTrackerFileType is the type for functions that // write the DCUs status to DCU Tracker file type writeDCUTrackerFileType func(dcu_tracker_data_t) error // validateDCUsInfoType is the type for functions that // validate the DCUs info type validateDCUsInfoType func(map[int]hydcu.DeviceInfo) (bool, error) type dcu_tracker_t struct { // path to DCU Tracker lock file dcuTrackerLockFile string // function to check if DCU Tracker is initialized isDCUTrackerInitialized isDCUTrackerInitializedType // function to initialize DCU Tracker initializeDCUTracker initializeDCUTrackerType // function to parse DCU list strings parseDCUsList parseDCUsListType // function to read DCU Tracker file readDCUTrackerFile readDCUTrackerFileType // function to write DCU Tracker file writeDCUTrackerFile writeDCUTrackerFileType // function to validate DCUs info validateDCUsInfo validateDCUsInfoType } const ( dcuTrackerFile = "/var/log/dcu-tracker.json" dcuTrackerLockFile = "/var/log/dcu-tracker.lock" ) func setupSignalHandler(lock *flock.Flock) { c := make(chan os.Signal, 1) signal.Notify(c, syscall.SIGINT, syscall.SIGTERM) go func() { sig := <-c fmt.Printf("Received signal: %v. Cleaning up...\n", sig) if lock != nil { _ = lock.Unlock() } os.Exit(1) }() } func acquireLock(lockFile string) (*flock.Flock, error) { lock := flock.New(lockFile) timeout := time.After(10 * time.Second) tick := time.Tick(100 * time.Millisecond) for { select { case <-timeout: return nil, fmt.Errorf("Acquiring lock timeout exceeded") case <-tick: locker, err := lock.TryLock() if err != nil { return nil, fmt.Errorf("Failed to acquire lock, Error: %v", err) } if locker { return lock, nil } } } } func parseDCUsList(dcus string) ([]int, []string, []string, error) { // isHexString checks if a string contains only hexadecimal characters isHexString := func(s string) bool { if len(s) == 0 { return false } for _, c := range s { if !((c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F')) { return false } } return true } validDCUs := []int{} invalidDCUs := []string{} invalidDCUsRange := []string{} dcusInfo, err := hydcu.GetHYDCUs() if err != nil { return []int{}, []string{}, []string{}, fmt.Errorf("Failed to get DCU info, Error: %v", err) } if dcus == "all" || dcus == "All" || dcus == "ALL" { for i := 0; i < len(dcusInfo); i++ { validDCUs = append(validDCUs, i) } return validDCUs, []string{}, []string{}, nil } uuidToDCUIdMap, err := hydcu.GetUniqueIdToDeviceIndexMap() if err != nil { fmt.Printf("Failed to get UUID to DCU Id mappings: %v", err) uuidToDCUIdMap = make(map[string][]int) } for _, c := range strings.Split(dcus, ",") { if strings.HasPrefix(c, "0x") || strings.HasPrefix(c, "0X") || (len(c) > 8 && isHexString(c)) { uuid := strings.ToLower(c) if !strings.HasPrefix(uuid, "0x") { uuid = "0x" + uuid } if gpuIds, exists := uuidToDCUIdMap[uuid]; exists { validDCUs = append(validDCUs, gpuIds...) } else { uuid = strings.TrimPrefix(uuid, "0x") if dcuIds, exists := uuidToDCUIdMap[uuid]; exists { validDCUs = append(validDCUs, dcuIds...) } else { invalidDCUs = append(invalidDCUs, c) } } } else if strings.Contains(c, "-") { devsRange := strings.SplitN(c, "-", 2) start, err0 := strconv.Atoi(devsRange[0]) end, err1 := strconv.Atoi(devsRange[1]) if err0 != nil || err1 != nil || start < 0 || end < 0 || start > end { invalidDCUsRange = append(invalidDCUsRange, c) } else { for i := start; i <= end; i++ { if i < len(dcusInfo) { validDCUs = append(validDCUs, i) } else { invalidDCUs = append(invalidDCUs, strconv.Itoa(i)) } } } } else { i, err := strconv.Atoi(c) if err == nil { if i >= 0 && i < len(dcusInfo) { validDCUs = append(validDCUs, i) } else { invalidDCUs = append(invalidDCUs, c) } } else { invalidDCUs = append(invalidDCUs, c) } } } sort.Ints(validDCUs) return validDCUs, invalidDCUs, invalidDCUsRange, nil } func isDCUTrackerInitialized() (bool, error) { dcuTrackerInitialized := false _, err := os.Stat(dcuTrackerFile) if err == nil { dcuTrackerInitialized = true } else { if !os.IsNotExist(err) { return false, fmt.Errorf("Error checking file %v, Error:%v", dcuTrackerFile, err) } } return dcuTrackerInitialized, nil } func readDCUTrackerFile() (dcu_tracker_data_t, error) { file, err := os.Open(dcuTrackerFile) if err != nil { return dcu_tracker_data_t{DCUsStatus: make(map[int]dcu_status_t), DCUsInfo: make(map[int]hydcu.DeviceInfo)}, fmt.Errorf("Error opening file, Error: %v", err) } defer file.Close() var dcuTrackerData dcu_tracker_data_t decoder := json.NewDecoder(file) if err := decoder.Decode(&dcuTrackerData); err != nil { return dcu_tracker_data_t{DCUsStatus: make(map[int]dcu_status_t), DCUsInfo: make(map[int]hydcu.DeviceInfo)}, fmt.Errorf("Failed to decode JSON, Error: %v", err) } return dcuTrackerData, nil } func writeDCUTrackerFile(dcuTrackerData dcu_tracker_data_t) error { tempPath := dcuTrackerFile + ".tmp" tempFile, err := os.Create(tempPath) if err != nil { return fmt.Errorf("Error creating temp file, Error: %v", err) } encoder := json.NewEncoder(tempFile) if err := encoder.Encode(dcuTrackerData); err != nil { tempFile.Close() os.Remove(tempPath) return fmt.Errorf("Error encoding JSON to temp file, Error: %v", err) } if err := tempFile.Sync(); err != nil { tempFile.Close() os.Remove(tempPath) return fmt.Errorf("Error syncing temp file: %v", err) } tempFile.Close() if err := os.Rename(tempPath, dcuTrackerFile); err != nil { return fmt.Errorf("Error renaming temp file: %v", err) } return nil } func initializeDCUTracker() error { dcusInfo, err := hydcu.GetHYDCUs() if err != nil { return fmt.Errorf("Failed to get HY DCUs info, Error: %v", err) } uuidToDCUIdMap, err := hydcu.GetUniqueIdToDeviceIndexMap() if err != nil { uuidToDCUIdMap = make(map[string][]int) // Continue with empty map } dcuIdToUUIDMap := make(map[int]string) for uuid, dcuIds := range uuidToDCUIdMap { if strings.HasPrefix(uuid, "0x") || strings.HasPrefix(uuid, "0X") { uuid = uuid[2:] } uuid = "0x" + strings.ToUpper(uuid) for _, dcuId := range dcuIds { dcuIdToUUIDMap[dcuId] = uuid } } dcuTrackerData := dcu_tracker_data_t{Enabled: false, DCUsStatus: make(map[int]dcu_status_t), DCUsInfo: make(map[int]hydcu.DeviceInfo)} for dcuId, dcuInfo := range dcusInfo { dcuTrackerData.DCUsInfo[dcuId] = dcuInfo dcuTrackerData.DCUsStatus[dcuId] = dcu_status_t{ UUID: dcuIdToUUIDMap[dcuId], PartitionType: dcusInfo[dcuId].PartitionType, Accessibility: SHARED_ACCESS, ContainerIds: []string{}, } } return writeDCUTrackerFile(dcuTrackerData) } func validateDCUsInfo(savedDCUsInfo map[int]hydcu.DeviceInfo) (bool, error) { tempDCUsInfo, err := hydcu.GetHYDCUs() if err != nil { return false, fmt.Errorf("Failed to get HY DCUs info, Error: %v", err) } currentDCUsInfo := make(map[int]hydcu.DeviceInfo) for dcuId, dcuInfo := range tempDCUsInfo { currentDCUsInfo[dcuId] = dcuInfo } equal := reflect.DeepEqual(savedDCUsInfo, currentDCUsInfo) if equal != true { fmt.Printf("DCUs info is invalid. Please reset DCU Tracker.\n") return false, nil } return true, nil } func (dcuTracker *dcu_tracker_t) Init() (err error) { lock, err := acquireLock(dcuTracker.dcuTrackerLockFile) if err != nil { return fmt.Errorf("Init lock failed: %v", err) } defer func() { if lock != nil { _ = lock.Unlock() } }() setupSignalHandler(lock) defer func() { if r := recover(); r != nil { err = fmt.Errorf("Panic in Init: %v", r) } }() err = dcuTracker.initializeDCUTracker() if err != nil { return fmt.Errorf("Failed to initialize GPU Tracker, Error: %v", err) } return nil } func (dcuTracker *dcu_tracker_t) Enable() (err error) { lock, err := acquireLock(dcuTracker.dcuTrackerLockFile) if err != nil { return fmt.Errorf("Enable lock failed: %v", err) } defer func() { if lock != nil { _ = lock.Unlock() } }() setupSignalHandler(lock) defer func() { if r := recover(); r != nil { err = fmt.Errorf("Panic in Enable: %v", r) } }() dcuTrackerInitialized, err := dcuTracker.isDCUTrackerInitialized() if err != nil { return fmt.Errorf("Failed to check if DCU Tracker is initialized, Error: %v\n", err) } if !dcuTrackerInitialized { err := dcuTracker.initializeDCUTracker() if err != nil { return err } } dcusTrackerData, err := dcuTracker.readDCUTrackerFile() if err != nil { fmt.Printf("Failed to show DCU Tracker status, Error: %v\n", err) return err } if dcusTrackerData.Enabled { fmt.Printf("DCU Tracker is already enabled\n") return nil } err = dcuTracker.initializeDCUTracker() if err != nil { fmt.Printf("Failed to enable DCU Tracker, Error: %v\n", err) return err } dcusTrackerData, err = dcuTracker.readDCUTrackerFile() if err != nil { fmt.Printf("Failed to enable DCU Tracker, Error: %v\n", err) return err } dcusTrackerData.Enabled = true err = dcuTracker.writeDCUTrackerFile(dcusTrackerData) if err != nil { fmt.Printf("Failed to enable DCU Tracker, Error: %v\n", err) return err } fmt.Printf("DCU Tracker has been enabled\n") return nil } func (dcuTracker *dcu_tracker_t) Disable() (err error) { lock, err := acquireLock(dcuTracker.dcuTrackerLockFile) if err != nil { return fmt.Errorf("Disable lock failed: %v", err) } defer func() { if lock != nil { _ = lock.Unlock() } }() setupSignalHandler(lock) defer func() { if r := recover(); r != nil { err = fmt.Errorf("Panic in Disable: %v", r) } }() dcuTrackerInitialized, err := dcuTracker.isDCUTrackerInitialized() if err != nil { fmt.Printf("Failed to check if DCU Tracker is initialized, Error:%v\n", err) return err } if !dcuTrackerInitialized { err := dcuTracker.initializeDCUTracker() if err != nil { fmt.Printf("Failed to disable DCU Tracker, Error: %v\n", err) return err } } else { dcusTrackerData, err := dcuTracker.readDCUTrackerFile() if err != nil { fmt.Printf("Failed to disable DCU Tracker, Error: %v\n", err) return err } dcusTrackerData.Enabled = false err = dcuTracker.writeDCUTrackerFile(dcusTrackerData) if err != nil { fmt.Printf("Failed to disable DCU Tracker, Error: %v\n", err) return err } } fmt.Printf("DCU Tracker has been disabled\n") return nil } func (dcuTracker *dcu_tracker_t) Reset() (err error) { lock, err := acquireLock(dcuTracker.dcuTrackerLockFile) if err != nil { return fmt.Errorf("Reset lock failed: %v", err) } defer func() { if lock != nil { _ = lock.Unlock() } }() setupSignalHandler(lock) defer func() { if r := recover(); r != nil { err = fmt.Errorf("Panic in Reset: %v", r) } }() dcuTrackerInitialized, err := dcuTracker.isDCUTrackerInitialized() if err != nil { fmt.Printf("Failed to check if DCU Tracker is initialized, Error:%v\n", err) return err } dcuTrackerEnabled := false if !dcuTrackerInitialized { err := dcuTracker.initializeDCUTracker() if err != nil { fmt.Printf("Failed to reset DCU Tracker, Error: %v\n", err) return err } } else { dcusTrackerData, err := dcuTracker.readDCUTrackerFile() if err != nil { fmt.Printf("Failed to reset DCU Tracker, Error: %v\n", err) return err } dcuTrackerEnabled = dcusTrackerData.Enabled err = dcuTracker.initializeDCUTracker() if err != nil { fmt.Printf("Failed to reset DCU Tracker, Error: %v\n", err) return err } dcusTrackerData, err = dcuTracker.readDCUTrackerFile() if err != nil { fmt.Printf("Failed to reset DCU Tracker, Error: %v\n", err) return err } if dcuTrackerEnabled == true { dcusTrackerData.Enabled = true err = dcuTracker.writeDCUTrackerFile(dcusTrackerData) if err != nil { fmt.Printf("Failed to reset DCU Tracker, Error: %v\n", err) return err } } } fmt.Printf("DCU Tracker has been reset\n") if dcuTrackerEnabled { fmt.Printf("Since DCU Tracker was enabled, it is recommended to stop and restart running containers to get the most accurate GPU Tracker status\n") } return nil } func (dcuTracker *dcu_tracker_t) ShowStatus() (err error) { lock, err := acquireLock(dcuTracker.dcuTrackerLockFile) if err != nil { return fmt.Errorf("ShowStatus lock failed: %v", err) } defer func() { if lock != nil { _ = lock.Unlock() } }() setupSignalHandler(lock) defer func() { if r := recover(); r != nil { err = fmt.Errorf("Panic in ShowStatus: %v", r) } }() dcuTrackerInitialized, err := dcuTracker.isDCUTrackerInitialized() if err != nil { fmt.Printf("Failed to check if DCU Tracker is initialized, Error:%v\n", err) return err } if !dcuTrackerInitialized { err := dcuTracker.initializeDCUTracker() if err != nil { return err } } dcusTrackerData, err := dcuTracker.readDCUTrackerFile() if err != nil { fmt.Printf("Failed to show DCU Tracker status, Error: %v\n", err) return err } if dcusTrackerData.Enabled == false { fmt.Printf("DCU Tracker is disabled\n") return nil } result, err := dcuTracker.validateDCUsInfo(dcusTrackerData.DCUsInfo) if err != nil || result != true { return err } fmt.Println(strings.Repeat("-", 120)) fmt.Printf("%-10s%-25s%-20s%-65s\n", "GPU Id", "UUID", "Accessibility", "Container Ids") fmt.Println(strings.Repeat("-", 120)) for dcuId := 0; dcuId < len(dcusTrackerData.DCUsStatus); dcuId++ { var accessibility string switch dcusTrackerData.DCUsStatus[dcuId].Accessibility { case SHARED_ACCESS: accessibility = "Shared" case EXCLUSIVE_ACCESS: accessibility = "Exclusive" default: fmt.Printf("Invalid accessibility value %v\n", dcusTrackerData.DCUsStatus[dcuId].Accessibility) break } if len(dcusTrackerData.DCUsStatus[dcuId].ContainerIds) > 0 { for idx, id := range dcusTrackerData.DCUsStatus[dcuId].ContainerIds { if idx == 0 { fmt.Printf("%-10v%-25s%-20v%-65v\n", dcuId, dcusTrackerData.DCUsStatus[dcuId].UUID, accessibility, id) } else { fmt.Printf("%-10v%-25v%-20v%-65v\n", "", "", "", id) } } } else { fmt.Printf("%-10v%-25v%-20v%-65v\n", dcuId, dcusTrackerData.DCUsStatus[dcuId].UUID, accessibility, "None") } } return nil } func (dcuTracker *dcu_tracker_t) MakeDCUsExclusive(dcus string) (err error) { lock, err := acquireLock(dcuTracker.dcuTrackerLockFile) if err != nil { return fmt.Errorf("MakeDCUsExclusive lock failed: %v", err) } defer func() { if lock != nil { _ = lock.Unlock() } }() setupSignalHandler(lock) defer func() { if r := recover(); r != nil { err = fmt.Errorf("Panic in MakeDCUsExclusive: %v", r) } }() dcuTrackerInitialized, err := dcuTracker.isDCUTrackerInitialized() if err != nil { fmt.Printf("Failed to check if DCU Tracker is initialized, Error:%v\n", err) return err } if !dcuTrackerInitialized { err := dcuTracker.initializeDCUTracker() if err != nil { return err } } dcusTrackerData, err := dcuTracker.readDCUTrackerFile() if err != nil { fmt.Printf("Failed to make DCUs exclusive, Error: %v\n", err) return err } if dcusTrackerData.Enabled == false { fmt.Printf("DCU Tracker is disabled\n") return nil } result, err := dcuTracker.validateDCUsInfo(dcusTrackerData.DCUsInfo) if err != nil || result != true { return err } validDCUs, invalidDCUs, invalidDCUsRange, err := dcuTracker.parseDCUsList(dcus) if err != nil { fmt.Printf("Failed to parse DCUs list, Error: %v\n", err) return err } dcusMadeExclusive := []int{} dcusNotMadeExclusive := []int{} for _, dcuId := range validDCUs { if len(dcusTrackerData.DCUsStatus[dcuId].ContainerIds) < 2 { dcusTrackerData.DCUsStatus[dcuId] = dcu_status_t{ UUID: dcusTrackerData.DCUsStatus[dcuId].UUID, PartitionType: dcusTrackerData.DCUsStatus[dcuId].PartitionType, Accessibility: EXCLUSIVE_ACCESS, ContainerIds: dcusTrackerData.DCUsStatus[dcuId].ContainerIds, } dcusMadeExclusive = append(dcusMadeExclusive, dcuId) } else { dcusNotMadeExclusive = append(dcusNotMadeExclusive, dcuId) } } err = dcuTracker.writeDCUTrackerFile(dcusTrackerData) if err != nil { fmt.Printf("Failed to make DCUs exclusive, Error: %v\n", err) return err } if len(dcusMadeExclusive) > 0 { fmt.Printf("DCUs %v have been made exclusive\n", dcusMadeExclusive) } if len(dcusNotMadeExclusive) > 0 { fmt.Printf("DCUs %v have not been made exclusive because more than one container is currently using it\n", dcusNotMadeExclusive) } if len(invalidDCUsRange) > 0 { fmt.Printf("Ignoring %v DCUs Ranges as they are invalid\n", invalidDCUsRange) } if len(invalidDCUs) > 0 { fmt.Printf("Ignoring %v DCUs as they are invalid\n", invalidDCUs) } return nil } func (dcuTracker *dcu_tracker_t) MakeDCUsShared(dcus string) (err error) { lock, err := acquireLock(dcuTracker.dcuTrackerLockFile) if err != nil { return fmt.Errorf("MakeDCUsShared lock failed: %v", err) } defer func() { if lock != nil { _ = lock.Unlock() } }() setupSignalHandler(lock) defer func() { if r := recover(); r != nil { err = fmt.Errorf("Panic in MakeDCUsShared: %v", r) } }() dcuTrackerInitialized, err := dcuTracker.isDCUTrackerInitialized() if err != nil { fmt.Printf("Failed to check if DCU Tracker is initialized, Error:%v\n", err) return err } if !dcuTrackerInitialized { err = dcuTracker.initializeDCUTracker() if err != nil { return err } } dcusTrackerData, err := dcuTracker.readDCUTrackerFile() if err != nil { fmt.Printf("Failed to make DCUs %v shared, Error: %v\n", dcus, err) return err } if dcusTrackerData.Enabled == false { fmt.Printf("DCU Tracker is disabled\n") return nil } result, err := dcuTracker.validateDCUsInfo(dcusTrackerData.DCUsInfo) if err != nil || result != true { return err } validDCUs, invalidDCUs, invalidDCUsRange, err := dcuTracker.parseDCUsList(dcus) if err != nil { fmt.Printf("Failed to parse DCUs list %v, Error: %v\n", dcus, err) return err } for _, dcuId := range validDCUs { dcusTrackerData.DCUsStatus[dcuId] = dcu_status_t{ UUID: dcusTrackerData.DCUsStatus[dcuId].UUID, PartitionType: dcusTrackerData.DCUsStatus[dcuId].PartitionType, Accessibility: SHARED_ACCESS, ContainerIds: dcusTrackerData.DCUsStatus[dcuId].ContainerIds, } } err = dcuTracker.writeDCUTrackerFile(dcusTrackerData) if err != nil { fmt.Printf("Failed to make DCUs shared, Error: %v\n", err) return err } if len(validDCUs) > 0 { fmt.Printf("DCUs %v have been made shared\n", validDCUs) } if len(invalidDCUsRange) > 0 { fmt.Printf("Ignoring %v DCUs Ranges as they are invalid\n", invalidDCUsRange) } if len(invalidDCUs) > 0 { fmt.Printf("Ignoring %v DCUs as they are invalid\n", invalidDCUs) } return nil } func (dcuTracker *dcu_tracker_t) ReserveDCUs(dcus string, containerId string) (allocatedDCUs []int, err error) { lock, err := acquireLock(dcuTracker.dcuTrackerLockFile) if err != nil { return nil, fmt.Errorf("ReserveDCUs lock failed: %v", err) } defer func() { if lock != nil { _ = lock.Unlock() } }() setupSignalHandler(lock) defer func() { if r := recover(); r != nil { err = fmt.Errorf("Panic in ReserveDCUs: %v", r) allocatedDCUs = []int{} } }() dcuTrackerInitialized, err := dcuTracker.isDCUTrackerInitialized() if err != nil { fmt.Printf("Failed to check if DCU Tracker is initialized, Error:%v\n", err) return []int{}, err } if !dcuTrackerInitialized { err = dcuTracker.initializeDCUTracker() if err != nil { return []int{}, err } } dcusTrackerData, err := dcuTracker.readDCUTrackerFile() if err != nil { fmt.Printf("Failed to reserve DCUs %v, Error:%v\n", dcus, err) return []int{}, err } validDCUs, invalidDCUs, invalidDCUsRange, err := dcuTracker.parseDCUsList(dcus) if err != nil { fmt.Printf("Failed to parse DCUs list %v, Error: %v\n", dcus, err) return []int{}, err } if len(invalidDCUsRange) > 0 { fmt.Printf("Ignoring %v DCUs Ranges as they are invalid\n", invalidDCUsRange) } if len(invalidDCUs) > 0 { fmt.Printf("Ignoring %v DCUs as they are invalid\n", invalidDCUs) } if dcusTrackerData.Enabled == false { return validDCUs, nil } result, err := dcuTracker.validateDCUsInfo(dcusTrackerData.DCUsInfo) if err != nil || result != true { return []int{}, fmt.Errorf("DCUs info is invalid, Please reset DCU Tracker.\n") } var unavailableDCUs []int for _, dcuId := range validDCUs { if dcusTrackerData.DCUsStatus[dcuId].Accessibility == SHARED_ACCESS || (dcusTrackerData.DCUsStatus[dcuId].Accessibility == EXCLUSIVE_ACCESS && len(dcusTrackerData.DCUsStatus[dcuId].ContainerIds) == 0) { dcusTrackerData.DCUsStatus[dcuId] = dcu_status_t{ UUID: dcusTrackerData.DCUsStatus[dcuId].UUID, PartitionType: dcusTrackerData.DCUsStatus[dcuId].PartitionType, Accessibility: dcusTrackerData.DCUsStatus[dcuId].Accessibility, ContainerIds: append(dcusTrackerData.DCUsStatus[dcuId].ContainerIds, containerId), } allocatedDCUs = append(allocatedDCUs, dcuId) } else { unavailableDCUs = append(unavailableDCUs, dcuId) } } err = dcuTracker.writeDCUTrackerFile(dcusTrackerData) if err != nil { fmt.Printf("Failed to reserve DCUs %v, Error:%v\n", validDCUs, err) return []int{}, err } if len(allocatedDCUs) > 0 { fmt.Printf("DCUs %v allocated\n", allocatedDCUs) } if len(unavailableDCUs) > 0 { fmt.Printf("DCUs %v are exclusive and already in use\n", unavailableDCUs) return []int{}, fmt.Errorf("DCUs %v are exclusive and already in use\n", unavailableDCUs) } return allocatedDCUs, nil } func (dcuTracker *dcu_tracker_t) ReleaseDCUs(containerId string) (err error) { removeContainerId := func(containerId string, containerIds []string) ([]string, bool) { for idx, id := range containerIds { if id == containerId { return append(containerIds[:idx], containerIds[idx+1:]...), true } } return containerIds, false } lock, err := acquireLock(dcuTracker.dcuTrackerLockFile) if err != nil { return fmt.Errorf("ReleaseGPUs lock failed: %v", err) } defer func() { if lock != nil { _ = lock.Unlock() } }() setupSignalHandler(lock) defer func() { if r := recover(); r != nil { err = fmt.Errorf("Panic in ReleaseDCUs: %v", r) } }() dcuTrackerInitialized, err := dcuTracker.isDCUTrackerInitialized() if err != nil { fmt.Printf("Failed to check if DCU Tracker is initialized, Error:%v\n", err) return err } if dcuTrackerInitialized { dcusTrackerData, err := dcuTracker.readDCUTrackerFile() if err != nil { fmt.Printf("Failed to release DCUs used by container %v, Error: %v\n", containerId, err) return err } var releasedDCUs []int for dcuId, _ := range dcusTrackerData.DCUsStatus { containerIds, released := removeContainerId(containerId, dcusTrackerData.DCUsStatus[dcuId].ContainerIds) if released { dcusTrackerData.DCUsStatus[dcuId] = dcu_status_t{ UUID: dcusTrackerData.DCUsStatus[dcuId].UUID, PartitionType: dcusTrackerData.DCUsStatus[dcuId].PartitionType, Accessibility: dcusTrackerData.DCUsStatus[dcuId].Accessibility, ContainerIds: containerIds, } releasedDCUs = append(releasedDCUs, dcuId) } } err = dcuTracker.writeDCUTrackerFile(dcusTrackerData) if err != nil { fmt.Printf("Failed to release DCUs used by container %v, Error: %v\n", containerId, err) return err } fmt.Printf("Released DCUs %v used by container %v\n", releasedDCUs, containerId) } return nil } func New() (Interface, error) { dcuTracker := &dcu_tracker_t{ dcuTrackerLockFile: dcuTrackerLockFile, isDCUTrackerInitialized: isDCUTrackerInitialized, initializeDCUTracker: initializeDCUTracker, parseDCUsList: parseDCUsList, readDCUTrackerFile: readDCUTrackerFile, writeDCUTrackerFile: writeDCUTrackerFile, validateDCUsInfo: validateDCUsInfo, } return dcuTracker, nil }