// metrics.cpp
#include "metrics.h"
#include <iostream>

// 构造函数
5
Metrics::Metrics(const MetricsConfig &config)
6
    : registry_(std::make_shared<prometheus::Registry>()),
7
      exposer_(config.endpoint), stop_uptime_thread_(false),
8
9
      start_time_(std::chrono::steady_clock::now()) {
  // 定义统一的桶大小,最大为 10000 ms (10 s)
10
11
12
  std::vector<double> common_buckets = {
      0.001, 0.005, 0.01,  0.05,  0.1,    0.5,    1.0,    5.0,
      10.0,  50.0,  100.0, 500.0, 1000.0, 5000.0, 10000.0}; // 毫秒
13
14

  // 注册 TTFT_ms Histogram
15
  auto &TTFT_family = prometheus::BuildHistogram()
16
17
18
19
20
21
                          .Name(std::string(METRIC_PREFIX) + "_TTFT_ms")
                          .Help("Time to first token in milliseconds")
                          .Register(*registry_);
  TTFT_ms = &TTFT_family.Add({{"model", config.model_name}}, common_buckets);

  // 注册 TBT_ms Histogram
22
  auto &TBT_family = prometheus::BuildHistogram()
23
24
25
26
27
28
                         .Name(std::string(METRIC_PREFIX) + "_TBT_ms")
                         .Help("Time between tokens in milliseconds")
                         .Register(*registry_);
  TBT_ms = &TBT_family.Add({{"model", config.model_name}}, common_buckets);

  // 注册 schedule_time Histogram
29
30
31
32
33
34
35
  auto &schedule_time_family =
      prometheus::BuildHistogram()
          .Name(std::string(METRIC_PREFIX) + "_schedule_time_ms")
          .Help("Time to generate schedule in milliseconds")
          .Register(*registry_);
  schedule_time =
      &schedule_time_family.Add({{"model", config.model_name}}, common_buckets);
36
37

  // 注册 generated_tokens Counter
38
39
40
41
42
43
44
  auto &generated_tokens_family =
      prometheus::BuildCounter()
          .Name(std::string(METRIC_PREFIX) + "_generated_tokens_total")
          .Help("Total generated tokens")
          .Register(*registry_);
  generated_tokens =
      &generated_tokens_family.Add({{"model", config.model_name}});
45
46

  // 注册 throughput_query Gauge
47
48
49
50
51
52
53
  auto &throughput_query_family =
      prometheus::BuildGauge()
          .Name(std::string(METRIC_PREFIX) + "_throughput_query")
          .Help("Throughput per second based on queries")
          .Register(*registry_);
  throughput_query =
      &throughput_query_family.Add({{"model", config.model_name}});
54
55

  // 注册 throughput_generated_tokens Gauge
56
57
58
59
60
61
62
  auto &throughput_generated_tokens_family =
      prometheus::BuildGauge()
          .Name(std::string(METRIC_PREFIX) + "_throughput_generated_tokens")
          .Help("Throughput per second based on generated tokens")
          .Register(*registry_);
  throughput_generated_tokens =
      &throughput_generated_tokens_family.Add({{"model", config.model_name}});
63
64

  // 注册 event_count Counter family
65
66
67
68
69
70
71
72
73
74
75
  event_count_family_ =
      &prometheus::BuildCounter()
           .Name(std::string(METRIC_PREFIX) + "_event_count_total")
           .Help("Count of various events")
           .Register(*registry_);

  batch_count_family_ =
      &prometheus::BuildCounter()
           .Name(std::string(METRIC_PREFIX) + "_batch_count_total")
           .Help("Count of various batch by status")
           .Register(*registry_);
76
77

  // 注册 query_count Counter family
78
79
80
81
82
  query_count_family_ =
      &prometheus::BuildCounter()
           .Name(std::string(METRIC_PREFIX) + "_query_count_total")
           .Help("Count of queries by status")
           .Register(*registry_);
83
84

  // 注册 uptime_ms Gauge
85
  auto &uptime_family = prometheus::BuildGauge()
86
87
88
89
90
91
                            .Name(std::string(METRIC_PREFIX) + "_uptime_ms")
                            .Help("Uptime of the scheduler in milliseconds")
                            .Register(*registry_);
  uptime_ms = &uptime_family.Add({{"model", config.model_name}});

  // 注册 GPU 利用率 Gauges
92
93
94
95
96
  auto &gpu_util_family =
      prometheus::BuildGauge()
          .Name(std::string(METRIC_PREFIX) + "_gpu_utilization_ratio")
          .Help("Current GPU utilization ratio (0 to 1)")
          .Register(*registry_);
97
  for (size_t i = 0; i < config.gpu_count; ++i) {
98
99
    gpu_utilization_gauges.push_back(&gpu_util_family.Add(
        {{"gpu_id", std::to_string(i)}, {"model", config.model_name}}));
100
101
102
103
104
105
106
107
108
109
  }

  // 将 Registry 注册到 Exposer 中
  exposer_.RegisterCollectable(registry_);

  // 启动 uptime 更新线程
  StartUptimeUpdater();
}

// 析构函数
110
Metrics::~Metrics() { StopUptimeUpdater(); }
111
112
113
114
115
116

// 启动 uptime 更新线程
void Metrics::StartUptimeUpdater() {
  uptime_thread_ = std::thread([this]() {
    while (!stop_uptime_thread_) {
      auto now = std::chrono::steady_clock::now();
117
118
      std::chrono::duration<double, std::milli> uptime_duration =
          now - start_time_;
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
      uptime_ms->Set(uptime_duration.count());
      // fn_every_sec(this);
      std::this_thread::sleep_for(std::chrono::seconds(1));
    }
  });
}

// Stops the uptime updater: raises the stop flag, then joins the thread if it
// is joinable.
void Metrics::StopUptimeUpdater() {
  stop_uptime_thread_ = true;

  if (!uptime_thread_.joinable()) {
    return;
  }
  uptime_thread_.join();
}

// 获取 event_count 指标
135
136
prometheus::Counter *Metrics::event_count(const std::string &type) {
  return &event_count_family_->Add({{"type", type}}); // 可根据需要添加更多标签
137
138
139
}

// 获取 query_count 指标
140
141
142
prometheus::Counter *Metrics::query_count(const std::string &status) {
  return &query_count_family_->Add(
      {{"status", status}}); // 可根据需要添加更多标签
143
144
}

145
prometheus::Counter *Metrics::batch_count(const std::string &type) {
146
147
  return &batch_count_family_->Add({{"type", type}});
}