// Copyright 2018-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//  * Redistributions of source code must retain the above copyright
//    notice, this list of conditions and the following disclaimer.
//  * Redistributions in binary form must reproduce the above copyright
//    notice, this list of conditions and the following disclaimer in the
//    documentation and/or other materials provided with the distribution.
//  * Neither the name of NVIDIA CORPORATION nor the names of its
//    contributors may be used to endorse or promote products derived
//    from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
#pragma once

#ifdef TRITON_ENABLE_METRICS

// NOTE(review): the original #include targets were stripped during
// extraction; the headers below cover every stdlib name the declarations
// in this file use (std::atomic, std::mutex, std::thread, std::istream,
// std::map, std::unordered_map, std::vector, std::shared_ptr, ...).
#include <atomic>
#include <cstdint>
#include <istream>
#include <map>
#include <memory>
#include <mutex>
#include <string>
#include <thread>
#include <unordered_map>
#include <vector>

#include "prometheus/counter.h"
#include "prometheus/gauge.h"
#include "prometheus/registry.h"
#include "prometheus/serializer.h"
#include "prometheus/text_serializer.h"
#include "response_cache.h"

#ifdef TRITON_ENABLE_METRICS_GPU
// NOTE(review): include target lost in extraction; the DCGM agent header
// declares dcgmHandle_t / dcgmGpuGrp_t used by DcgmMetadata below — confirm.
#include <dcgm_agent.h>
#endif  // TRITON_ENABLE_METRICS_GPU

namespace triton { namespace core {

#ifdef TRITON_ENABLE_METRICS_CPU
// Parsed contents of "/proc/meminfo": field name (e.g. "MemTotal") -> value.
// NOTE(review): the mapped type was lost in extraction; 'double' assumed —
// confirm against Metrics::ParseMemInfo in the .cc file.
using MemInfo = std::unordered_map<std::string, double>;

// Aggregate CPU jiffy counters parsed from the "cpu ..." line of /proc/stat,
// in kernel field order.
// References:
// - htop source: https://stackoverflow.com/a/23376195
// - Linux docs: https://www.kernel.org/doc/Documentation/filesystems/proc.txt
// guest/guestnice values are counted in user/nice so we skip parsing them
struct CpuInfo {
  uint64_t user = 0;     // normal processes executing in user mode
  uint64_t nice = 0;     // niced processes executing in user mode
  uint64_t system = 0;   // processes executing in kernel mode
  uint64_t idle = 0;     // twiddling thumbs
  uint64_t iowait = 0;   // waiting for I/O to complete
  uint64_t irq = 0;      // servicing interrupts
  uint64_t softirq = 0;  // servicing softirqs
  uint64_t steal = 0;    // involuntary wait
};

// Extracts the eight jiffy counters, in declaration order, from a stream
// positioned just past the "cpu" token of /proc/stat.
inline std::istream&
operator>>(std::istream& is, CpuInfo& info)
{
  is >> info.user >> info.nice >> info.system >> info.idle >> info.iowait >>
      info.irq >> info.softirq >> info.steal;
  return is;
}
#endif  // TRITON_ENABLE_METRICS_CPU

#ifdef TRITON_ENABLE_METRICS_GPU
// Per-process bookkeeping for the DCGM connection used to poll GPU metrics.
// NOTE(review): all template arguments below were lost in extraction and are
// reconstructed from how the fields are named/used — confirm against the
// .cc file before relying on exact element types.
struct DcgmMetadata {
  // DCGM handles for initialization and destruction
  dcgmHandle_t dcgm_handle_ = 0;
  dcgmGpuGrp_t groupId_ = 0;
  // DCGM Flags
  bool standalone_ = false;
  // DCGM Fields
  size_t field_count_ = 0;
  std::vector<unsigned short> fields_;
  // GPU Device Mapping
  std::map<uint32_t, uint32_t> cuda_ids_to_dcgm_ids_;
  std::vector<uint32_t> available_cuda_gpu_ids_;
  // Stop attempting metrics if they fail multiple consecutive
  // times for a device.
  const int fail_threshold_ = 3;
  // DCGM Failure Tracking: one consecutive-failure counter per device.
  std::vector<int> power_limit_fail_cnt_;
  std::vector<int> power_usage_fail_cnt_;
  std::vector<int> energy_fail_cnt_;
  std::vector<int> util_fail_cnt_;
  std::vector<int> mem_fail_cnt_;
  // DCGM Energy Tracking: last sampled cumulative energy per device, used to
  // report deltas.
  std::vector<unsigned long long> last_energy_;
  // Track if DCGM handle initialized successfully
  bool dcgm_initialized_ = false;
};
#endif  // TRITON_ENABLE_METRICS_GPU

// Process-wide singleton owning the prometheus registry and every metric
// family Triton reports (inference counters, response-cache stats, and
// optional GPU/CPU gauges polled by a background thread).
class Metrics {
 public:
  // Return the hash value of the labels
  static size_t HashLabels(const std::map<std::string, std::string>& labels);

  // Are metrics enabled?
  static bool Enabled();

  // Enable reporting of metrics
  static void EnableMetrics();

  // Enable reporting of GPU metrics
  static void EnableGPUMetrics();

  // Enable reporting of CPU metrics
  static void EnableCpuMetrics();

  // Enable reporting of Cache metrics
  static void EnableCacheMetrics(
      std::shared_ptr<RequestResponseCache> response_cache);

  // Start a thread for polling enabled metrics if any
  static void StartPollingThreadSingleton(
      std::shared_ptr<RequestResponseCache> response_cache);

  // Set the time interval in secs at which metrics are collected
  static void SetMetricsInterval(uint64_t metrics_interval_ms);

  // Get the prometheus registry
  static std::shared_ptr<prometheus::Registry> GetRegistry();

  // Get serialized metrics
  static const std::string SerializedMetrics();

  // Get the UUID for a CUDA device. Return true and initialize 'uuid'
  // if a UUID is found, return false if a UUID cannot be returned.
  static bool UUIDForCudaDevice(int cuda_device, std::string* uuid);

  // Metric family counting successful inference requests
  static prometheus::Family<prometheus::Counter>& FamilyInferenceSuccess()
  {
    return GetSingleton()->inf_success_family_;
  }

  // Metric family counting failed inference requests
  static prometheus::Family<prometheus::Counter>& FamilyInferenceFailure()
  {
    return GetSingleton()->inf_failure_family_;
  }

  // Metric family counting inferences performed, where a batch-size
  // 'n' inference request is counted as 'n' inferences
  static prometheus::Family<prometheus::Counter>& FamilyInferenceCount()
  {
    return GetSingleton()->inf_count_family_;
  }

  // Metric family counting inference executions, where a batch-size
  // 'n' inference request is counted as one execution
  static prometheus::Family<prometheus::Counter>&
  FamilyInferenceExecutionCount()
  {
    return GetSingleton()->inf_count_exec_family_;
  }

  // Metric family of cumulative inference request duration, in
  // microseconds
  static prometheus::Family<prometheus::Counter>&
  FamilyInferenceRequestDuration()
  {
    return GetSingleton()->inf_request_duration_us_family_;
  }

  // Metric family of cumulative inference queuing duration, in
  // microseconds
  static prometheus::Family<prometheus::Counter>&
  FamilyInferenceQueueDuration()
  {
    return GetSingleton()->inf_queue_duration_us_family_;
  }

  // Metric family of cumulative inference compute durations, in
  // microseconds
  static prometheus::Family<prometheus::Counter>&
  FamilyInferenceComputeInputDuration()
  {
    return GetSingleton()->inf_compute_input_duration_us_family_;
  }
  static prometheus::Family<prometheus::Counter>&
  FamilyInferenceComputeInferDuration()
  {
    return GetSingleton()->inf_compute_infer_duration_us_family_;
  }
  static prometheus::Family<prometheus::Counter>&
  FamilyInferenceComputeOutputDuration()
  {
    return GetSingleton()->inf_compute_output_duration_us_family_;
  }

  // Metric families of per-model response cache metrics.
  // NOTE(review): template arguments reconstructed as Counter (cumulative
  // per-model counts/durations) — confirm against metric registration.
  static prometheus::Family<prometheus::Counter>& FamilyCacheHitCount()
  {
    return GetSingleton()->cache_num_hits_model_family_;
  }
  static prometheus::Family<prometheus::Counter>& FamilyCacheHitLookupDuration()
  {
    return GetSingleton()->cache_hit_lookup_duration_us_model_family_;
  }
  static prometheus::Family<prometheus::Counter>& FamilyCacheMissCount()
  {
    return GetSingleton()->cache_num_misses_model_family_;
  }
  static prometheus::Family<prometheus::Counter>&
  FamilyCacheMissLookupDuration()
  {
    return GetSingleton()->cache_miss_lookup_duration_us_model_family_;
  }
  static prometheus::Family<prometheus::Counter>&
  FamilyCacheMissInsertionDuration()
  {
    return GetSingleton()->cache_miss_insertion_duration_us_model_family_;
  }

 private:
  Metrics();
  virtual ~Metrics();
  static Metrics* GetSingleton();
  bool InitializeDcgmMetrics();
  bool InitializeCpuMetrics();
  bool InitializeCacheMetrics(
      std::shared_ptr<RequestResponseCache> response_cache);
  bool StartPollingThread(
      std::shared_ptr<RequestResponseCache> response_cache);
  bool PollCacheMetrics(std::shared_ptr<RequestResponseCache> response_cache);
  bool PollDcgmMetrics();
  bool PollCpuMetrics();
  // Map a DCGM sentinel value to a human-readable error string.
  std::string dcgmValueToErrorMessage(double val);
  std::string dcgmValueToErrorMessage(int64_t val);

  std::shared_ptr<prometheus::Registry> registry_;
  std::unique_ptr<prometheus::Serializer> serializer_;

  prometheus::Family<prometheus::Counter>& inf_success_family_;
  prometheus::Family<prometheus::Counter>& inf_failure_family_;
  prometheus::Family<prometheus::Counter>& inf_count_family_;
  prometheus::Family<prometheus::Counter>& inf_count_exec_family_;
  prometheus::Family<prometheus::Counter>& inf_request_duration_us_family_;
  prometheus::Family<prometheus::Counter>& inf_queue_duration_us_family_;
  prometheus::Family<prometheus::Counter>&
      inf_compute_input_duration_us_family_;
  prometheus::Family<prometheus::Counter>&
      inf_compute_infer_duration_us_family_;
  prometheus::Family<prometheus::Counter>&
      inf_compute_output_duration_us_family_;

  // Global Response Cache metrics
  prometheus::Family<prometheus::Gauge>& cache_num_entries_family_;
  prometheus::Family<prometheus::Gauge>& cache_num_lookups_family_;
  prometheus::Family<prometheus::Gauge>& cache_num_hits_family_;
  prometheus::Family<prometheus::Gauge>& cache_num_misses_family_;
  prometheus::Family<prometheus::Gauge>& cache_num_evictions_family_;
  prometheus::Family<prometheus::Gauge>& cache_lookup_duration_us_family_;
  prometheus::Family<prometheus::Gauge>& cache_insertion_duration_us_family_;
  prometheus::Family<prometheus::Gauge>& cache_util_family_;
  // Gauges for Global Response Cache metrics
  prometheus::Gauge* cache_num_entries_global_;
  prometheus::Gauge* cache_num_lookups_global_;
  prometheus::Gauge* cache_num_hits_global_;
  prometheus::Gauge* cache_num_misses_global_;
  prometheus::Gauge* cache_num_evictions_global_;
  prometheus::Gauge* cache_lookup_duration_us_global_;
  prometheus::Gauge* cache_insertion_duration_us_global_;
  prometheus::Gauge* cache_util_global_;
  // Per-model Response Cache metrics
  prometheus::Family<prometheus::Counter>& cache_num_hits_model_family_;
  prometheus::Family<prometheus::Counter>&
      cache_hit_lookup_duration_us_model_family_;
  prometheus::Family<prometheus::Counter>& cache_num_misses_model_family_;
  prometheus::Family<prometheus::Counter>&
      cache_miss_lookup_duration_us_model_family_;
  prometheus::Family<prometheus::Counter>&
      cache_miss_insertion_duration_us_model_family_;

#ifdef TRITON_ENABLE_METRICS_GPU
  // NOTE(review): energy consumption is cumulative, so it is reconstructed
  // as a Counter; the rest are point-in-time Gauges — confirm in the .cc.
  prometheus::Family<prometheus::Gauge>& gpu_utilization_family_;
  prometheus::Family<prometheus::Gauge>& gpu_memory_total_family_;
  prometheus::Family<prometheus::Gauge>& gpu_memory_used_family_;
  prometheus::Family<prometheus::Gauge>& gpu_power_usage_family_;
  prometheus::Family<prometheus::Gauge>& gpu_power_limit_family_;
  prometheus::Family<prometheus::Counter>& gpu_energy_consumption_family_;
  // One metric instance per visible GPU, indexed by device.
  std::vector<prometheus::Gauge*> gpu_utilization_;
  std::vector<prometheus::Gauge*> gpu_memory_total_;
  std::vector<prometheus::Gauge*> gpu_memory_used_;
  std::vector<prometheus::Gauge*> gpu_power_usage_;
  std::vector<prometheus::Gauge*> gpu_power_limit_;
  std::vector<prometheus::Counter*> gpu_energy_consumption_;
  DcgmMetadata dcgm_metadata_;
#endif  // TRITON_ENABLE_METRICS_GPU

#ifdef TRITON_ENABLE_METRICS_CPU
  // Parses "/proc/meminfo" for metrics, currently only supported on Linux.
  Status ParseMemInfo(MemInfo& info);
  // Parses "/proc/stat" for metrics, currently only supported on Linux.
  Status ParseCpuInfo(CpuInfo& info);
  // Computes CPU utilization between "info_new" and "info_old" values
  double CpuUtilization(const CpuInfo& info_new, const CpuInfo& info_old);

  prometheus::Family<prometheus::Gauge>& cpu_utilization_family_;
  prometheus::Family<prometheus::Gauge>& cpu_memory_total_family_;
  prometheus::Family<prometheus::Gauge>& cpu_memory_used_family_;
  prometheus::Gauge* cpu_utilization_;
  prometheus::Gauge* cpu_memory_total_;
  prometheus::Gauge* cpu_memory_used_;
  // Previous /proc/stat sample; CpuUtilization() diffs against it.
  CpuInfo last_cpu_info_;
#endif  // TRITON_ENABLE_METRICS_CPU

  // Thread for polling cache/gpu metrics periodically
  std::unique_ptr<std::thread> poll_thread_;
  std::atomic<bool> poll_thread_exit_;
  bool metrics_enabled_;
  bool gpu_metrics_enabled_;
  bool cpu_metrics_enabled_;
  bool cache_metrics_enabled_;
  bool poll_thread_started_;
  // Serialize Enable*() calls and polling-thread startup respectively.
  std::mutex metrics_enabling_;
  std::mutex poll_thread_starting_;
  uint64_t metrics_interval_ms_;
};

}}  // namespace triton::core

#endif  // TRITON_ENABLE_METRICS