metrics.h 2.34 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
#ifndef Metrics_H
#define Metrics_H

#include <prometheus/counter.h>
#include <prometheus/exposer.h>
#include <prometheus/gauge.h>
#include <prometheus/histogram.h>
#include <prometheus/registry.h>
#include <atomic>
#include <chrono>
#include <cstddef>
#include <functional>
#include <memory>
#include <string>
#include <thread>
#include <vector>

#include "timer.hpp"
// Metric prefix, defined as a macro that expands to a string literal.
#define METRIC_PREFIX "scheduler"
// Forward declaration; the full definition of Metrics appears further down in this header.
class Metrics;

// Configuration passed to the Metrics constructor.
// Every field has a well-defined value after default construction:
// the strings are empty and gpu_count is 0.
struct MetricsConfig {
  // NOTE(review): presumably the address the metrics exposer binds to —
  // confirm against the Metrics constructor.
  std::string endpoint;
  std::string model_name;  // Model name, e.g. "gpt-4"
  size_t gpu_count = 0;    // Number of GPUs; defaults to 0 instead of being left indeterminate
};

// Metrics class: initializes Prometheus metrics from a MetricsConfig.
//
// Holds a Prometheus registry and exposer plus non-owning handles to the
// individual metrics. All raw handles and the stop flag carry default member
// initializers, so anything the out-of-line constructor does not set is
// nullptr / false rather than indeterminate.
class Metrics {
 public:
  // Constructs the object from a MetricsConfig. Defined outside this header.
  // NOTE(review): this single-argument constructor is not `explicit`, so a
  // MetricsConfig converts implicitly to Metrics; left unchanged to avoid
  // breaking existing callers.
  Metrics(const MetricsConfig& config);
  ~Metrics();

  // Copy construction and copy assignment are disabled.
  Metrics(const Metrics&) = delete;
  Metrics& operator=(const Metrics&) = delete;

  // Optional callback that receives a Metrics pointer; empty until assigned.
  // NOTE(review): the name suggests it is invoked once per second by the
  // uptime updater — confirm in the implementation file.
  std::function<void(Metrics*)> fn_every_sec;

  // Metric handles (raw pointers, nullptr until assigned).
  // NOTE(review): presumably these point at objects owned by registry_ —
  // confirm in the constructor.
  prometheus::Gauge* uptime_ms = nullptr;
  // NOTE(review): TTFT / TBT presumably mean "time to first token" and
  // "time between tokens" — confirm with the scheduler code.
  prometheus::Histogram* TTFT_ms = nullptr;
  prometheus::Histogram* TBT_ms = nullptr;
  prometheus::Histogram* schedule_time = nullptr;
  prometheus::Gauge* throughput_query = nullptr;
  prometheus::Gauge* throughput_generated_tokens = nullptr;
  prometheus::Counter* generated_tokens = nullptr;
  // NOTE(review): presumably one gauge per GPU (see MetricsConfig::gpu_count)
  // — confirm in the constructor.
  std::vector<prometheus::Gauge*> gpu_utilization_gauges;

  // Counter-family accessors: each takes a string and returns a counter
  // pointer. NOTE(review): presumably the argument is used as a label value
  // on the corresponding *_family_ member — confirm in the implementation.
  prometheus::Counter* event_count(const std::string& type);
  prometheus::Counter* query_count(const std::string& status);
  prometheus::Counter* batch_count(const std::string& type);

 private:
  // Declaration order below is kept exactly as before: it determines member
  // construction and destruction order.
  std::shared_ptr<prometheus::Registry> registry_;
  prometheus::Exposer exposer_;

  // Counter families
  prometheus::Family<prometheus::Counter>* event_count_family_ = nullptr;
  prometheus::Family<prometheus::Counter>* batch_count_family_ = nullptr;
  prometheus::Family<prometheus::Counter>* query_count_family_ = nullptr;

  // Thread and control flag used to update uptime_ms.
  std::thread uptime_thread_;
  // Explicitly initialized: before C++20 a default-constructed std::atomic
  // holds an indeterminate value.
  std::atomic<bool> stop_uptime_thread_{false};

  // Starts the uptime update thread.
  void StartUptimeUpdater();
  // Stops the uptime update thread.
  void StopUptimeUpdater();

  // Records the program start time.
  std::chrono::steady_clock::time_point start_time_;
};

// Scope-bound timer that reports into a Prometheus histogram.
//
// The constructor stores the histogram pointer and calls timer.start(); the
// destructor passes timer.elapsedMs() to histogram->Observe(). A null
// histogram is tolerated: the destructor then records nothing instead of
// dereferencing a null pointer.
//
// NOTE(review): copy operations are not disabled here; if Timer is copyable,
// each copy of a wrapper would record its own observation when destroyed —
// confirm that callers never copy instances.
struct HistogramTimerWrapper {
  prometheus::Histogram* histogram;  // Never deleted by this wrapper; may be null.
  Timer timer;

  // Stores `histogram` and starts the timer immediately.
  HistogramTimerWrapper(prometheus::Histogram* histogram) : histogram(histogram), timer() { timer.start(); }

  // Records the elapsed time unless no histogram was supplied.
  ~HistogramTimerWrapper() {
    if (histogram != nullptr) {
      histogram->Observe(timer.elapsedMs());
    }
  }
};

#endif  // Metrics_H