package backend import ( "container/list" "errors" "os" "os/exec" "regexp" "strconv" "strings" ) type RCCL_BINARY string const ( RCCL_ALL_GATHER RCCL_BINARY = "all_gather_perf" RCCL_ALL_REDUCE RCCL_BINARY = "all_reduce_perf" RCCL_ALL_TO_ALL RCCL_BINARY = "alltoall_perf" RCCL_BROADCASE RCCL_BINARY = "broadcast_perf" RCCL_GATHER RCCL_BINARY = "gather_perf" RCCL_REDUCE RCCL_BINARY = "reduce_perf" RCCL_REDUCE_SCATTER RCCL_BINARY = "reduce_scatter_perf" RCCL_SCATTER RCCL_BINARY = "scatter_perf" RCCL_SEND_RECV RCCL_BINARY = "send_recv_perf" ) type RcclTestAllReducePrefResult struct { DTKPath string `json:"dtk_path"` // dtk 库路径 Args string `json:"args"` // 执行参数 TestVersion string `json:"test_version"` // rccl-tests 版本信息 UseDevice []string `json:"use_device"` // 使用的设备列表 Results []*RcclTestItem `json:"results"` RawOutput string `json:"raw_output,omitempty"` } type RcclTestItem struct { Size uint64 `json:"size"` Count uint64 `json:"count"` Type string `json:"type"` Redop string `json:"redop"` Root int `json:"root"` OutOfPlace Metrics `json:"out_of_place"` InPlace Metrics `json:"in_place"` } type Metrics struct { Time float64 `json:"time"` AlgBW float64 `json:"alg_bw"` BusBW float64 `json:"bus_bw"` Wrong uint32 `json:"wrong"` } func NewRcclTestItem(str []string) *RcclTestItem { if len(str) != 13 { return nil } item := &RcclTestItem{} i, err := strconv.ParseUint(str[0], 10, 64) if err != nil { return nil } item.Size = i i, err = strconv.ParseUint(str[1], 10, 64) if err != nil { return nil } item.Count = i item.Type = str[2] item.Redop = str[3] r, err := strconv.ParseInt(str[4], 10, 10) if err != nil { return nil } item.Root = int(r) t, err := strconv.ParseFloat(str[5], 64) if err != nil { return nil } item.OutOfPlace.Time = t t, err = strconv.ParseFloat(str[6], 64) if err != nil { return nil } item.OutOfPlace.AlgBW = t t, err = strconv.ParseFloat(str[7], 64) if err != nil { return nil } item.OutOfPlace.BusBW = t w, err := strconv.ParseUint(str[8], 10, 32) if err != nil { return nil } item.OutOfPlace.Wrong = uint32(w) t, err = strconv.ParseFloat(str[9], 64) if err != nil { return nil } item.InPlace.Time = t t, err = strconv.ParseFloat(str[10], 64) if err != nil { return nil } item.InPlace.AlgBW = t t, err = strconv.ParseFloat(str[11], 64) if err != nil { return nil } item.InPlace.BusBW = t w, err = strconv.ParseUint(str[12], 10, 32) if err != nil { return nil } item.InPlace.Wrong = uint32(w) return item } var ( ReUselessLine = regexp.MustCompile(`^#\s*$`) ReSharpLine = regexp.MustCompile(`^#.*$`) ReRcclVersion = regexp.MustCompile(`(?mi)^rccl-tests:\s+Version\s+(.+)$`) ReMetricsLine = regexp.MustCompile(`(?mi)^\s*(\d+)\s+(\d+)\s+(\w+)\s+(\w+)\s+((?:-|)\d+)\s+([0-9.]+)\s+([0-9.]+)\s+([0-9.]+)\s+(\d+)\s+([0-9.]+)\s+([0-9.]+)\s+([0-9.]+)\s+(\d+)$`) ReDeviceLineFlag = regexp.MustCompile(`(?mi)^#\s*using\s+devices\s*$`) ReDeviceLine = regexp.MustCompile(`(?mi)^#\s*rank\s+(\d+)\s+pid\s+(\d+)\s+on\s+(\w+)\s+device\s+(\d+)\s+\[([0-9a-zA-Z:.]*)\]\s(\w+)$`) ReDtkPath = regexp.MustCompile(`^(.*dtk[0-9A-Za-z.-]*).*`) ) // RcclTestCheck 检查 rccl-tests 目录及指定二进制文件是否存在,返回二进制文件的完整路径 func RcclTestCheck(rccl_test_path string, binary RCCL_BINARY) (string, error) { // 检查 rccl-tests 目录是否存在 stat, err := os.Stat(rccl_test_path) if err != nil { return "", err } if !stat.IsDir() { return "", os.ErrNotExist } // 检查可执行文件是否存在 exePath := strings.TrimSuffix(rccl_test_path, "/") + "/" + string(binary) exeStat, err := os.Stat(exePath) if err != nil { return "", err } if exeStat.IsDir() { return "", errors.New("it's a dir, not file") } if exeStat.Mode().Perm()&0111 == 0 { return "", errors.New("file is not executable") } return exePath, nil } func GetRcclDtkPath(rccl_test_path string, binary RCCL_BINARY) (string, error) { path, err := RcclTestCheck(rccl_test_path, binary) if err != nil { return "", err } output, err := exec.Command("ldd", path).CombinedOutput() if err != nil { return "", err } str := string(output) lines := strings.Split(str, "\n") for _, v := range lines { if strings.Contains(v, "librccl.so") { parts := strings.Fields(v) for _, part := range parts { if strings.Contains(part, "dtk") && ReDtkPath.MatchString(part) { m := ReDtkPath.FindStringSubmatch(part) if len(m) == 2 { return m[1], nil } } } } } return "", errors.New("librccl.so not found in ldd output") } func AllReducePerf(rccl_test_path string, args string) (*RcclTestAllReducePrefResult, error) { path, err := RcclTestCheck(rccl_test_path, RCCL_ALL_REDUCE) if err != nil { return nil, err } args = strings.Trim(args, " ") var output []byte if args == "" { output, err = exec.Command(path).CombinedOutput() } else { output, err = exec.Command(path, strings.Fields(args)...).CombinedOutput() } if err != nil { return nil, err } str := string(output) res := ParseRcclOutput(str) res.Args = args dp, err := GetRcclDtkPath(rccl_test_path, RCCL_ALL_REDUCE) if err == nil { res.DTKPath = dp } return res, nil } func ParseRcclOutput(output string) *RcclTestAllReducePrefResult { result := RcclTestAllReducePrefResult{} result.UseDevice = make([]string, 0, 8) str := strings.Trim(output, "\n") lines := strings.Split(str, "\n") sharpLines := list.New() nosharpLines := list.New() testItems := make([]*RcclTestItem, 0, 16) for _, v := range lines { if ReUselessLine.MatchString(v) { continue } if ReSharpLine.MatchString(v) { sharpLines.PushBack(v) } else { nosharpLines.PushBack(v) } } cache := make([]string, 0, 16) for e := nosharpLines.Front(); e != nil; e = e.Next() { line, ok := e.Value.(string) if !ok { continue } if ReMetricsLine.MatchString(line) { match := ReMetricsLine.FindStringSubmatch(line) if len(match) == 14 { item := NewRcclTestItem(match[1:]) if item != nil { testItems = append(testItems, item) } } } else { cache = append(cache, line) } } result.Results = testItems for _, v := range cache { if ReRcclVersion.MatchString(v) { match := ReRcclVersion.FindStringSubmatch(v) if len(match) == 2 { result.TestVersion = match[1] } break } } findFlag := false for e := sharpLines.Front(); e != nil; e = e.Next() { line, ok := e.Value.(string) if !ok { continue } if ReDeviceLineFlag.MatchString(line) { findFlag = true continue } if findFlag && ReDeviceLine.MatchString(line) { match := ReDeviceLine.FindStringSubmatch(line) if len(match) == 7 { result.UseDevice = append(result.UseDevice, match[6]) } } else if findFlag { findFlag = false break } } result.RawOutput = output return &result }