metrics.h 2.35 KB
Newer Older
1
2
3
#ifndef Metrics_H
#define Metrics_H

4
5
6
#include <atomic>
#include <chrono>
#include <functional>
#include <memory>
#include <string>
#include <thread>
#include <vector>

#include <prometheus/counter.h>
#include <prometheus/exposer.h>
#include <prometheus/gauge.h>
#include <prometheus/histogram.h>
#include <prometheus/registry.h>

#include "timer.hpp"
// Macro defining the metric name prefix.
#define METRIC_PREFIX "scheduler"
// Forward declaration; the Metrics class is defined further down in this header.
class Metrics;

// Configuration structure passed to the Metrics constructor.
struct MetricsConfig {
  // NOTE(review): presumably the address the Prometheus exposer listens on —
  // confirm against the Metrics constructor.
  std::string endpoint;
  std::string model_name; // Model name, e.g. "gpt-4"
  // Number of GPUs. Defaults to 0 so a default-constructed config never
  // carries an indeterminate value.
  size_t gpu_count = 0;
};

// Metrics 类,根据配置初始化 Prometheus 指标
class Metrics {
30
public:
31
  // 构造函数传入 MetricsConfig
32
  Metrics(const MetricsConfig &config);
33
34
35
  ~Metrics();

  // 禁止拷贝和赋值
36
37
  Metrics(const Metrics &) = delete;
  Metrics &operator=(const Metrics &) = delete;
38

39
  std::function<void(Metrics *)> fn_every_sec;
40
41

  // 指标指针
42
43
44
45
46
47
48
49
  prometheus::Gauge *uptime_ms;
  prometheus::Histogram *TTFT_ms;
  prometheus::Histogram *TBT_ms;
  prometheus::Histogram *schedule_time;
  prometheus::Gauge *throughput_query;
  prometheus::Gauge *throughput_generated_tokens;
  prometheus::Counter *generated_tokens;
  std::vector<prometheus::Gauge *> gpu_utilization_gauges;
50
51

  // 计数器家族
52
53
54
  prometheus::Counter *event_count(const std::string &type);
  prometheus::Counter *query_count(const std::string &status);
  prometheus::Counter *batch_count(const std::string &type);
55

56
private:
57
58
59
60
  std::shared_ptr<prometheus::Registry> registry_;
  prometheus::Exposer exposer_;

  // 计数器家族
61
62
63
  prometheus::Family<prometheus::Counter> *event_count_family_;
  prometheus::Family<prometheus::Counter> *batch_count_family_;
  prometheus::Family<prometheus::Counter> *query_count_family_;
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78

  // 线程和控制变量用于更新 uptime_ms
  std::thread uptime_thread_;
  std::atomic<bool> stop_uptime_thread_;

  // 启动 uptime 更新线程
  void StartUptimeUpdater();
  // 停止 uptime 更新线程
  void StopUptimeUpdater();

  // 记录程序启动时间
  std::chrono::steady_clock::time_point start_time_;
};

struct HistogramTimerWrapper {
79
  prometheus::Histogram *histogram;
80
  Timer timer;
81
82
83
84
  inline HistogramTimerWrapper(prometheus::Histogram *histogram)
      : histogram(histogram), timer() {
    timer.start();
  }
85
86
87
  inline ~HistogramTimerWrapper() { histogram->Observe(timer.elapsedMs()); }
};

88
#endif // Metrics_H