// Copyright 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//  * Redistributions of source code must retain the above copyright
//    notice, this list of conditions and the following disclaimer.
//  * Redistributions in binary form must reproduce the above copyright
//    notice, this list of conditions and the following disclaimer in the
//    documentation and/or other materials provided with the distribution.
//  * Neither the name of NVIDIA CORPORATION nor the names of its
//    contributors may be used to endorse or promote products derived
//    from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#pragma once

#include <atomic>
#include <condition_variable>
#include <deque>
#include <functional>
#include <map>
#include <memory>
#include <mutex>
#include <queue>
#include <vector>

#include "backend_model.h"
#include "backend_model_instance.h"
#include "instance_queue.h"
#include "model_config.pb.h"
#include "payload.h"
#include "status.h"

namespace triton { namespace core {

// Limits the rate at which requests are dispatched to the model instances.
class RateLimiter {
 public:
  using RateLimiterConfig = inference::ModelRateLimiter;
  // Maps a resource-kind key (a device id, or one of RESOURCE_KIND_KEY)
  // to a map from resource name to available resource count.
  using ResourceMap = std::map<int, std::map<std::string, size_t>>;
  enum RESOURCE_KIND_KEY {
    // Key for holding global resources
    GLOBAL_RESOURCE_KEY = -2,
    // Key for holding resources per each device
    PER_DEVICE_RESOURCE_KEY = -1
  };

  /// Creates a rate limiter object which will funnel the requests to
  /// the model instances. A model instance within the RateLimiter
  /// typically transitions from available -> staged -> allocated ->
  /// available. The transition from available to staged occurs when a
  /// request is registered for the model. Depending upon resource
  /// availability and priority, the RateLimiter will transition an
  /// instance to the allocated state at some point in the future. The
  /// staged state is skipped when configured to ignore the resource
  /// constraints; the cycle in this case is
  /// available -> allocated -> available.
  /// \param ignore_resources_and_priority Whether or not to ignore resource
  /// constraints and cross-model priority. An available instance is directly
  /// allocated when true.
  /// \param resource_map The map to the available resource count provided
  /// explicitly.
  /// \return Status object indicating success or failure.
  static Status Create(
      const bool ignore_resources_and_priority,
      const ResourceMap& resource_map,
      std::unique_ptr<RateLimiter>* rate_limiter);
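
  /// A minimal usage sketch (illustrative only; the resource names "R1"
  /// and "R2" and the counts below are assumptions for the example, not
  /// part of this API):
  ///
  ///   RateLimiter::ResourceMap resources;
  ///   // 4 units of "R1" shared globally across all devices.
  ///   resources[RateLimiter::GLOBAL_RESOURCE_KEY]["R1"] = 4;
  ///   // 2 units of "R2" available on each device.
  ///   resources[RateLimiter::PER_DEVICE_RESOURCE_KEY]["R2"] = 2;
  ///   std::unique_ptr<RateLimiter> rate_limiter;
  ///   Status status = RateLimiter::Create(
  ///       false /* ignore_resources_and_priority */, resources,
  ///       &rate_limiter);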

  /// Registers the model instance with the rate limiter.
  /// \param instance The pointer to the TritonModelInstance object to register
  /// with the rate limiter.
  /// \param rate_limiter_config The rate limiter configuration associated with
  /// the model instance.
  /// \return Status object indicating success or failure.
  Status RegisterModelInstance(
      TritonModelInstance* instance,
      const RateLimiterConfig& rate_limiter_config);

  /// Removes the model from the set of models being managed by the rate
  /// limiter.
  /// \param model The pointer to the TritonModel object to be removed.
  /// \return Status object indicating success or failure.
  Status UnregisterModel(const TritonModel* model);

  /// Returns true if there is a payload slot available for the given model.
  /// \param model The pointer to the TritonModel object to query.
  /// \return slot availability in boolean.
  bool PayloadSlotAvailable(const TritonModel* model);

  /// Enqueues the payload to the rate limiter for scheduling on the given
  /// model.
  /// \param model The pointer to the TritonModel object to schedule on.
  /// \param payload The shared pointer to the payload object.
  /// \return Status object indicating success or failure.
  Status EnqueuePayload(
      const TritonModel* model, std::shared_ptr<Payload> payload);

  /// Returns the payload that has been scheduled for the given set of model
  /// instances. Note that this call blocks until a payload is available in
  /// the rate limiter for one of the given triton model instances.
  /// \param instances The pointers to TritonModelInstance objects whose
  /// payload is being requested.
  /// \param payload The shared pointer to the payload object.
  void DequeuePayload(
      std::deque<TritonModelInstance*>& instances,
      std::shared_ptr<Payload>* payload);

  /// Returns a new payload object.
  /// \param op_type The operation type for the payload.
  /// \param instance Optional field that provides the model instance that must
  /// be used for the execution of the payload. Default is nullptr, which
  /// allows any model instance to execute the payload.
  /// \return The shared pointer to a new payload object.
  std::shared_ptr<Payload> GetPayload(
      const Payload::Operation op_type,
      TritonModelInstance* instance = nullptr);

  /// Releases the given payload object back to the rate limiter.
  /// \param payload The payload to release.
  void PayloadRelease(std::shared_ptr<Payload>& payload);
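
  /// A sketch of the intended scheduling flow (illustrative only; `model`
  /// and `instances` are assumed to come from the caller's scheduler, and
  /// the INFER_RUN operation name is an assumption about
  /// Payload::Operation):
  ///
  ///   // Producer side: wrap requests in a payload and enqueue it.
  ///   auto payload = rate_limiter->GetPayload(Payload::Operation::INFER_RUN);
  ///   // ... populate the payload with requests ...
  ///   RETURN_IF_ERROR(rate_limiter->EnqueuePayload(model, payload));
  ///
  ///   // Consumer side (per backend thread): block until a payload is
  ///   // scheduled on one of the given instances, execute it, release it.
  ///   std::shared_ptr<Payload> scheduled;
  ///   rate_limiter->DequeuePayload(instances, &scheduled);
  ///   // ... execute the payload ...
  ///   rate_limiter->PayloadRelease(scheduled);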

 private:
  class ModelInstanceContext;
  class ModelContext;
  struct PayloadQueue;

  using StandardReleaseFunc = std::function<void(ModelInstanceContext*)>;
  using StandardScheduleFunc = std::function<void(ModelInstanceContext*)>;
  using StandardStageFunc = std::function<void(ModelInstanceContext*)>;

  // Holds the state of the model instance.
  class ModelInstanceContext {
   public:
    friend class RateLimiter;
    friend class ResourceManager;

    enum State { AVAILABLE, STAGED, ALLOCATED, REMOVED };

    void Release();
    TritonModelInstance* RawInstance() const { return triton_model_instance_; }

   private:
    ModelInstanceContext(
        TritonModelInstance* triton_model_instance,
        ModelContext* model_context,
        const RateLimiterConfig& rate_limiter_config,
        StandardStageFunc OnStage, StandardReleaseFunc OnRelease);

    const RateLimiterConfig* GetRateLimiterConfig() const
    {
      return &rate_limiter_config_;
    }
    void MarkAvailable();
    double ScaledPriority();
    Status Stage(StandardScheduleFunc OnSchedule);
    Status Allocate();
    Status DirectAllocate(StandardScheduleFunc OnSchedule);
    void RequestRemoval();
    void WaitForRemoval();

    TritonModelInstance* triton_model_instance_;
    size_t index_;
    ModelContext* model_context_;
    RateLimiterConfig rate_limiter_config_;

    StandardStageFunc OnStage_;
    StandardReleaseFunc OnRelease_;

    // Number of executions served by this instance.
    std::atomic<uint64_t> exec_count_;

    State state_;
    bool removal_in_progress_;
    std::mutex state_mtx_;

    StandardScheduleFunc OnSchedule_;
    std::condition_variable cv_;
  };

  // Orders model instances by their scaled priority.
  class ScaledPriorityComparator {
   public:
    bool operator()(ModelInstanceContext* a, ModelInstanceContext* b)
    {
      return a->ScaledPriority() > b->ScaledPriority();
    }
  };

  using PriorityQueue = std::priority_queue<
      ModelInstanceContext*, std::vector<ModelInstanceContext*>,
      ScaledPriorityComparator>;
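
  // Note on the comparator direction: std::priority_queue keeps at the
  // top the element that compares "last" under the comparator, so the
  // operator> above yields a min-heap on ScaledPriority(): the instance
  // with the smallest scaled priority is popped first. A self-contained
  // sketch of the same ordering:
  //
  //   std::priority_queue<double, std::vector<double>, std::greater<double>>
  //       pq;
  //   pq.push(3.0);
  //   pq.push(1.0);
  //   pq.push(2.0);
  //   // pq.top() == 1.0 -> the lowest scaled priority wins.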

  // Holds the active context of a model.
  class ModelContext {
   public:
    ModelContext();
    Status EnqueueModelInstanceRequest(
        const StandardScheduleFunc& OnSchedule,
        TritonModelInstance* triton_model_instance);
    void AddAvailableInstance(ModelInstanceContext* instance);
    void StageInstanceIfAvailable(TritonModelInstance* triton_model_instance);
    void AllocateInstanceIfAvailable();
    void AddSpecificRequestQueue();
    bool ContainsPendingRequests(int32_t index);
    void RequestRemoval();
    bool isRemovalInProgress() { return removal_in_progress_; }

   private:
    bool removal_in_progress_;

    // Queues holding pending scheduling requests.
    std::queue<StandardScheduleFunc> generic_sched_request_queue_;
    std::vector<std::queue<StandardScheduleFunc>>
        specific_sched_request_queues_;
    std::recursive_mutex sched_request_queue_mtx_;

    // The set of instances that are available at the moment.
    PriorityQueue avbl_instances_;
    std::recursive_mutex avbl_instances_mtx_;
  };

  // Manages and keeps track of resource allocation to the model instances.
  class ResourceManager {
   public:
    static Status Create(
        const ResourceMap& resource_map,
        std::unique_ptr<ResourceManager>* resource_manager);
    void AddModelInstance(const ModelInstanceContext* instance);
    Status RemoveModelInstance(const ModelInstanceContext* instance);
    Status UpdateResourceLimits();
    bool AllocateResources(const ModelInstanceContext* instance);
    Status ReleaseResources(const ModelInstanceContext* instance);

   private:
    ResourceManager(const ResourceMap& resource_map);
    Status ValidateMaxResources();
    Status ParseAndValidateExplicitResources();

    ResourceMap explicit_max_resources_;

    std::map<const ModelInstanceContext*, ResourceMap> model_resources_;
    std::mutex model_resources_mtx_;

    ResourceMap max_resources_;
    std::mutex max_resources_mtx_;

    ResourceMap allocated_resources_;
    std::mutex allocated_resources_mtx_;
  };

  RateLimiter(
      const bool ignore_resources_and_priority,
      const ResourceMap& resource_map);

  void InitializePayloadQueues(const TritonModelInstance* instance);
  Status DeferPayloadSchedule(
      const StandardScheduleFunc& OnSchedule, const TritonModel* model,
      TritonModelInstance* instance = nullptr);
  void OnStage(ModelInstanceContext* instance_ptr);
  void OnRelease(ModelInstanceContext* instance_ptr);
  void AttemptAllocation();
  void SchedulePayload(
      TritonModelInstance* tmi, PayloadQueue* payload_queue,
      const std::shared_ptr<Payload>& payload);

  bool ignore_resources_and_priority_;

  // Instance contexts for the models
  std::map<
      const TritonModel*, std::vector<std::unique_ptr<ModelInstanceContext>>>
      model_instance_ctxs_;
  std::mutex model_instance_ctx_mtx_;

  // Running contexts of the models
  std::map<const TritonModel*, ModelContext> model_contexts_;
  std::mutex model_ctx_mtx_;

  // Holds the model instances that have been staged
  PriorityQueue staged_instances_;
  std::recursive_mutex staged_instances_mtx_;

  // Manager to keep track of the resource allocations
  std::unique_ptr<ResourceManager> resource_manager_;

  // Mutex to serialize Payload [de]allocation
  std::mutex payload_mu_;

  // Mutex to serialize Payload Queues deallocation
  std::mutex payload_queues_mu_;

  // Keep some number of Payload objects for reuse to avoid the overhead
  // of creating a Payload for every new request.
  const size_t max_payload_bucket_count_;
  std::vector<std::shared_ptr<Payload>> payload_bucket_;
  std::deque<std::shared_ptr<Payload>> payloads_in_use_;

  struct PayloadQueue {
    explicit PayloadQueue(size_t max_batch_size, uint64_t max_queue_delay_ns)
    {
      queue_.reset(new InstanceQueue(max_batch_size, max_queue_delay_ns));
    }
    // Queue shared by all instances of the model.
    std::unique_ptr<InstanceQueue> queue_;
    // Queues for payloads pinned to a specific instance.
    std::map<const TritonModelInstance*, std::unique_ptr<InstanceQueue>>
        specific_queues_;
    std::mutex mu_;
    std::condition_variable cv_;
  };
  std::map<const TritonModel*, std::unique_ptr<PayloadQueue>> payload_queues_;
};

}}  // namespace triton::core
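
// Implementation note (an illustrative sketch, not the verbatim
// implementation): GetPayload()/PayloadRelease() amount to a simple object
// pool over payload_bucket_, guarded by payload_mu_, which avoids creating
// a Payload for every new request. The pattern, assuming a Payload::Reset()
// helper for re-initialization:
//
//   std::shared_ptr<Payload> payload;
//   {
//     std::lock_guard<std::mutex> lk(payload_mu_);
//     if (!payload_bucket_.empty()) {
//       payload = payload_bucket_.back();  // Reuse a pooled Payload...
//       payload_bucket_.pop_back();
//     } else {
//       payload = std::make_shared<Payload>();  // ...or allocate a new one.
//     }
//     payloads_in_use_.push_back(payload);
//   }
//   payload->Reset(op_type, instance);  // Reset() is assumed here.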