// Copyright 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//  * Redistributions of source code must retain the above copyright
//    notice, this list of conditions and the following disclaimer.
//  * Redistributions in binary form must reproduce the above copyright
//    notice, this list of conditions and the following disclaimer in the
//    documentation and/or other materials provided with the distribution.
//  * Neither the name of NVIDIA CORPORATION nor the names of its
//    contributors may be used to endorse or promote products derived
//    from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#pragma once

#include <cstdint>
#include <list>
#include <memory>
#include <mutex>
#include <string>
#include <unordered_map>
#include <vector>

#include "infer_request.h"
#include "infer_response.h"
#include "model.h"
#include "status.h"

#include <boost/functional/hash.hpp>
#include <boost/interprocess/managed_external_buffer.hpp>

namespace triton { namespace core {

// Assuming CPU memory only for now
struct Output {
  // Output tensor data buffer
  void* buffer_ = nullptr;
  // Size of "buffer_" above
  uint64_t buffer_size_ = 0;
  // Name of the output
  std::string name_;
  // Datatype of the output
  inference::DataType dtype_;
  // Shape of the output
  std::vector<int64_t> shape_;
};

struct CacheEntry {
  explicit CacheEntry() = default;
  // Points to this entry's key in the LRU list for maintaining LRU order
  std::list<uint64_t>::iterator lru_iter_;
  // Outputs of the cached response; each output buffer is allocated from
  // the managed buffer, i.e. buffer = managed_buffer_.allocate(size, ...)
  std::vector<Output> outputs_;
};

class RequestResponseCache {
 public:
  ~RequestResponseCache();
  // Create the request/response cache object
  static Status Create(
      uint64_t cache_size, std::unique_ptr<RequestResponseCache>* cache);
  // Hash the inference request for cache access and store the hash in the
  // "request" object. This is also called internally by Lookup/Insert if the
  // request hasn't already stored its hash. It is up to the user to update
  // the hash in the request if any hashed fields of the request object are
  // modified after the hash is stored.
  // Return Status object indicating success or failure.
  Status HashAndSet(InferenceRequest* const request);

  // Look up the 'request' hash in the cache. On a cache hit, the cached
  // inference response is returned in 'response'; on a cache miss,
  // 'response' is left unpopulated.
  // Return Status object indicating success or failure.
  Status Lookup(
      InferenceResponse* const response, InferenceRequest* const request);
  // Insert response into cache, evicting entries to make space if necessary.
  // Return Status object indicating success or failure.
  Status Insert(
      const InferenceResponse& response, InferenceRequest* const request);
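  // Illustrative caller flow (a sketch only; 'RunInference' is a
  // hypothetical helper, and this assumes a cache miss surfaces as a
  // non-OK Status):
  //   cache->HashAndSet(request);                // hash the request once
  //   if (!cache->Lookup(response, request).IsOk()) {
  //     RunInference(request, response);         // miss: compute the response
  //     cache->Insert(*response, request);       // then populate the cache
  //   }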
  // Evict an entry from the cache based on the (LRU) eviction policy
  // Return Status object indicating success or failure.
  Status Evict();
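  // A minimal sketch of LRU eviction over the members declared below
  // (hypothetical; the real logic lives in the implementation file): pop the
  // least-recently-used key off the back of 'lru_', return its output
  // buffers to the managed buffer, and erase the cache entry:
  //   const uint64_t key = lru_.back();
  //   for (auto& output : cache_[key].outputs_) {
  //     managed_buffer_.deallocate(output.buffer_);
  //   }
  //   lru_.pop_back();
  //   cache_.erase(key);
  //   num_evictions_++;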
  // Returns number of items in cache
  size_t NumEntries()
  {
    std::lock_guard<std::recursive_mutex> lk(cache_mtx_);
    return cache_.size();
  }
  // Returns number of items evicted in cache lifespan
  size_t NumEvictions()
  {
    std::lock_guard<std::recursive_mutex> lk(cache_mtx_);
    return num_evictions_;
  }
  // Returns number of lookups in cache lifespan; this equals hits + misses
  size_t NumLookups()
  {
    std::lock_guard<std::recursive_mutex> lk(cache_mtx_);
    return num_lookups_;
  }
  // Returns number of cache hits in cache lifespan
  size_t NumHits()
  {
    std::lock_guard<std::recursive_mutex> lk(cache_mtx_);
    return num_hits_;
  }
  // Returns number of cache misses in cache lifespan
  size_t NumMisses()
  {
    std::lock_guard<std::recursive_mutex> lk(cache_mtx_);
    return num_misses_;
  }
  // Returns the total lookup latency (nanoseconds) of all lookups in cache
  // lifespan
  uint64_t TotalLookupLatencyNs()
  {
    std::lock_guard<std::recursive_mutex> lk(cache_mtx_);
    return total_lookup_latency_ns_;
  }

  // Returns the total insertion latency (nanoseconds) of all insertions in
  // cache lifespan
  uint64_t TotalInsertionLatencyNs()
  {
    std::lock_guard<std::recursive_mutex> lk(cache_mtx_);
    return total_insertion_latency_ns_;
  }

  // Returns total number of bytes allocated for cache
  size_t TotalBytes()
  {
    std::lock_guard<std::recursive_mutex> lk(buffer_mtx_);
    return managed_buffer_.get_size();
  }
  // Returns number of free bytes in cache
  size_t FreeBytes()
  {
    std::lock_guard<std::recursive_mutex> lk(buffer_mtx_);
    return managed_buffer_.get_free_memory();
  }
  // Returns number of bytes in use by cache
  size_t AllocatedBytes()
  {
    std::lock_guard<std::recursive_mutex> lk(buffer_mtx_);
    return managed_buffer_.get_size() - managed_buffer_.get_free_memory();
  }
  // Returns the fraction of the total cache size currently allocated, in
  // [0, 1]
  double TotalUtilization()
  {
    std::lock_guard<std::recursive_mutex> lk(buffer_mtx_);
    return static_cast<double>(AllocatedBytes()) /
           static_cast<double>(TotalBytes());
  }

 private:
  explicit RequestResponseCache(const uint64_t cache_size);
  // Update LRU ordering of the entry at 'cache_iter' on lookup
  void UpdateLRU(
      std::unordered_map<uint64_t, CacheEntry>::iterator& cache_iter);
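  // Note: an O(1) way to implement this is std::list::splice, which moves
  // the entry's key to the front of 'lru_' without invalidating the stored
  // iterator (a sketch; the actual implementation may differ):
  //   lru_.splice(lru_.begin(), lru_, cache_iter->second.lru_iter_);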
  // Build CacheEntry from InferenceResponse
  Status BuildCacheEntry(
      const InferenceResponse& response, CacheEntry* const entry);
  // Build InferenceResponse from CacheEntry
  Status BuildInferenceResponse(
      const CacheEntry& entry, InferenceResponse* const response);
  // Helper function to hash data buffers used by "input"
  Status HashInputBuffers(const InferenceRequest::Input* input, size_t* seed);
  // Helper function to hash each input in "request"
  Status HashInputs(const InferenceRequest& request, size_t* seed);
  // Helper function to hash request and store it in "key"
  Status Hash(const InferenceRequest& request, uint64_t* key);
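  // These helpers compose naturally with boost::hash_combine, which mixes a
  // new value into a running seed. A sketch (an assumption; the real
  // implementation may hash additional request fields):
  //   size_t seed = 0;
  //   boost::hash_combine(seed, request.ModelName());
  //   HashInputs(request, &seed);  // mixes each input's name, shape, and data
  //   *key = static_cast<uint64_t>(seed);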

  // Raw cache buffer that backs 'managed_buffer_'
  void* buffer_ = nullptr;
  // Managed view over 'buffer_' from which output buffers are allocated
  boost::interprocess::managed_external_buffer managed_buffer_;
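  // Sketch of the expected setup (an assumption; the constructor owns the
  // details): a raw allocation of 'cache_size' bytes is handed to boost to
  // manage sub-allocations from:
  //   buffer_ = malloc(cache_size);
  //   managed_buffer_ = boost::interprocess::managed_external_buffer(
  //       boost::interprocess::create_only, buffer_, cache_size);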
  // key -> CacheEntry containing values and list iterator for LRU management
  std::unordered_map<uint64_t, CacheEntry> cache_;
  // List of keys sorted from most to least recently used
  std::list<uint64_t> lru_;
  // Cache metrics
  size_t num_evictions_ = 0;
  size_t num_lookups_ = 0;
  size_t num_hits_ = 0;
  size_t num_misses_ = 0;
  uint64_t total_lookup_latency_ns_ = 0;
  uint64_t total_insertion_latency_ns_ = 0;
  // Mutex for buffer synchronization
  std::recursive_mutex buffer_mtx_;
  // Mutex for cache synchronization
  std::recursive_mutex cache_mtx_;
};

}}  // namespace triton::core