"vscode:/vscode.git/clone" did not exist on "1ae1c33651e0d064976a9c180f553604770a0dca"
infer_response.h 12.8 KB
Newer Older
xiabo's avatar
xiabo committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
// Copyright 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//  * Redistributions of source code must retain the above copyright
//    notice, this list of conditions and the following disclaimer.
//  * Redistributions in binary form must reproduce the above copyright
//    notice, this list of conditions and the following disclaimer in the
//    documentation and/or other materials provided with the distribution.
//  * Neither the name of NVIDIA CORPORATION nor the names of its
//    contributors may be used to endorse or promote products derived
//    from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once

#include <deque>
#include <functional>
#include <string>
#include <vector>
#include "buffer_attributes.h"
#include "constants.h"
#include "infer_parameter.h"
#include "infer_trace.h"
#include "response_allocator.h"
#include "status.h"
#include "triton/common/model_config.h"
#include "tritonserver_apis.h"

namespace triton { namespace core {

class Model;
class InferenceResponse;
//
// An inference response factory.
//
class InferenceResponseFactory {
 public:
  InferenceResponseFactory() = default;

  InferenceResponseFactory(
      const std::shared_ptr<Model>& model, const std::string& id,
      const ResponseAllocator* allocator, void* alloc_userp,
      TRITONSERVER_InferenceResponseCompleteFn_t response_fn,
      void* response_userp,
      const std::function<void(
          std::unique_ptr<InferenceResponse>&&, const uint32_t)>& delegator)
      : model_(model), id_(id), allocator_(allocator),
        alloc_userp_(alloc_userp), response_fn_(response_fn),
        response_userp_(response_userp), response_delegator_(delegator)
  {
  }

  const ResponseAllocator* Allocator() { return allocator_; }
  void* AllocatorUserp() { return alloc_userp_; }

  Status SetResponseDelegator(
      const std::function<void(
          std::unique_ptr<InferenceResponse>&&, const uint32_t)>& delegator)
  {
    response_delegator_ = delegator;
    return Status::Success;
  }
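
  // Illustrative sketch of installing a delegator (hypothetical lambda; the
  // delegator is invoked when a response is sent, so it can inspect or
  // transform the response before completion):
  //
  //   factory.SetResponseDelegator(
  //       [](std::unique_ptr<InferenceResponse>&& response,
  //          const uint32_t flags) {
  //         // ... inspect or transform 'response', then forward it, e.g.
  //         // InferenceResponse::Send(std::move(response), flags);
  //       });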

  // Create a new response.
  Status CreateResponse(std::unique_ptr<InferenceResponse>* response) const;

  // Send a "null" response with 'flags'.
  Status SendFlags(const uint32_t flags) const;
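
  // Illustrative usage sketch (assumes a configured 'factory';
  // TRITONSERVER_RESPONSE_COMPLETE_FINAL is the standard completion flag from
  // tritonserver_apis.h, and error handling is omitted):
  //
  //   std::unique_ptr<InferenceResponse> response;
  //   factory.CreateResponse(&response);
  //   // ... add outputs / parameters to 'response' ...
  //   InferenceResponse::Send(
  //       std::move(response), TRITONSERVER_RESPONSE_COMPLETE_FINAL);
  //
  //   // When there is no response data to return, send only the flag:
  //   factory.SendFlags(TRITONSERVER_RESPONSE_COMPLETE_FINAL);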

#ifdef TRITON_ENABLE_TRACING
  const std::shared_ptr<InferenceTraceProxy>& Trace() const { return trace_; }
  void SetTrace(const std::shared_ptr<InferenceTraceProxy>& trace)
  {
    trace_ = trace;
  }
  void ReleaseTrace() { trace_ = nullptr; }
#endif  // TRITON_ENABLE_TRACING

 private:
  // The model associated with this factory. For normal
  // requests/responses this will always be defined and acts to keep
  // the model loaded as long as this factory is live. It may be
  // nullptr for cases where the model itself created the request
  // (like running requests for warmup), so any use must handle the
  // nullptr case.
  std::shared_ptr<Model> model_;

  // The ID of the corresponding request that should be included in every
  // response. This is a property that can be optionally provided by the user.
  std::string id_;

  // The response allocator and user pointer. The 'allocator_' is a
  // raw pointer because it is owned by the client, and the client is
  // responsible for ensuring that the lifetime of the allocator
  // extends longer than any request or response that depends on the
  // allocator.
  const ResponseAllocator* allocator_;
  void* alloc_userp_;

  // The response callback function and user pointer.
  TRITONSERVER_InferenceResponseCompleteFn_t response_fn_;
  void* response_userp_;

  // Delegator to be invoked on sending responses.
  std::function<void(std::unique_ptr<InferenceResponse>&&, const uint32_t)>
      response_delegator_;


#ifdef TRITON_ENABLE_TRACING
  // Inference trace associated with this response.
  std::shared_ptr<InferenceTraceProxy> trace_;
#endif  // TRITON_ENABLE_TRACING
};

//
// An inference response.
//
class InferenceResponse {
 public:
  // Output tensor
  class Output {
   public:
    Output(
        const std::string& name, const inference::DataType datatype,
        const std::vector<int64_t>& shape, const ResponseAllocator* allocator,
        void* alloc_userp)
        : name_(name), datatype_(datatype), shape_(shape),
          allocator_(allocator), alloc_userp_(alloc_userp),
          allocated_buffer_(nullptr)
    {
    }
    Output(
        const std::string& name, const inference::DataType datatype,
        std::vector<int64_t>&& shape, const ResponseAllocator* allocator,
        void* alloc_userp)
        : name_(name), datatype_(datatype), shape_(std::move(shape)),
          allocator_(allocator), alloc_userp_(alloc_userp),
          allocated_buffer_(nullptr)
    {
    }

    ~Output();

    // The name of the output tensor.
    const std::string& Name() const { return name_; }

    // Data type of the output tensor.
    inference::DataType DType() const { return datatype_; }

    // The shape of the output tensor.
    const std::vector<int64_t>& Shape() const { return shape_; }

    BufferAttributes* GetBufferAttributes() { return &buffer_attributes_; }

    // Reshape the output tensor. This function must only be called
    // for outputs that have a reshape specified in the model
    // configuration.
    void Reshape(
        const bool has_batch_dim, const inference::ModelOutput* output_config);

    // Get information about the buffer allocated for this output
    // tensor's data. If no buffer is allocated 'buffer' will return
    // nullptr and the other returned values will be undefined.
    Status DataBuffer(
        const void** buffer, size_t* buffer_byte_size,
        TRITONSERVER_MemoryType* memory_type, int64_t* memory_type_id,
        void** userp) const;
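
    // Illustrative query sketch (hypothetical variable names; error handling
    // omitted):
    //
    //   const void* buffer = nullptr;
    //   size_t byte_size = 0;
    //   TRITONSERVER_MemoryType memory_type;
    //   int64_t memory_type_id;
    //   void* userp = nullptr;
    //   output.DataBuffer(
    //       &buffer, &byte_size, &memory_type, &memory_type_id, &userp);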

    // Allocate the buffer that should be used for this output
    // tensor's data. 'buffer' must return a buffer of size
    // 'buffer_byte_size'.  'memory_type' acts as both input and
    // output. On input gives the buffer memory type preferred by the
    // caller and on return holds the actual memory type of
    // 'buffer'. 'memory_type_id' acts as both input and output. On
    // input gives the buffer memory type id preferred by the caller
    // and on return holds the actual memory type id of 'buffer'. Only
    // a single buffer may be allocated for the output at any time, so
    // multiple calls to AllocateDataBuffer() without an intervening
    // ReleaseDataBuffer() call will result in an error.
    Status AllocateDataBuffer(
        void** buffer, const size_t buffer_byte_size,
        TRITONSERVER_MemoryType* memory_type, int64_t* memory_type_id);
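
    // Illustrative allocation sketch showing the in/out convention (the CPU
    // preference and 'byte_size' value are examples only):
    //
    //   void* buffer = nullptr;
    //   TRITONSERVER_MemoryType memory_type = TRITONSERVER_MEMORY_CPU;
    //   int64_t memory_type_id = 0;
    //   output.AllocateDataBuffer(
    //       &buffer, byte_size, &memory_type, &memory_type_id);
    //   // On return 'memory_type' and 'memory_type_id' describe where
    //   // 'buffer' actually resides.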

    // Release the buffer that was previously allocated by
    // AllocateDataBuffer(). Do nothing if AllocateDataBuffer() has
    // not been called.
    Status ReleaseDataBuffer();

   private:
    DISALLOW_COPY_AND_ASSIGN(Output);
    friend std::ostream& operator<<(
        std::ostream& out, const InferenceResponse::Output& output);

    std::string name_;
    inference::DataType datatype_;
    std::vector<int64_t> shape_;

    // The response allocator and user pointer.
    const ResponseAllocator* allocator_;
    void* alloc_userp_;

    // Information about the buffer allocated by
    // AllocateDataBuffer(). This information is needed by
    // DataBuffer() and ReleaseDataBuffer().
    void* allocated_buffer_;
    BufferAttributes buffer_attributes_;
    void* allocated_userp_;
  };

  // InferenceResponse
  InferenceResponse(
      const std::shared_ptr<Model>& model, const std::string& id,
      const ResponseAllocator* allocator, void* alloc_userp,
      TRITONSERVER_InferenceResponseCompleteFn_t response_fn,
      void* response_userp,
      const std::function<void(
          std::unique_ptr<InferenceResponse>&&, const uint32_t)>& delegator);

  // "null" InferenceResponse is a special instance of InferenceResponse which
  // contains minimal information for calling InferenceResponse::Send,
  // InferenceResponse::NullResponse. nullptr will be passed as response in
  // 'response_fn'.
  InferenceResponse(
      TRITONSERVER_InferenceResponseCompleteFn_t response_fn,
      void* response_userp);

  const std::string& Id() const { return id_; }
  const std::string& ModelName() const;
  int64_t ActualModelVersion() const;
  const Status& ResponseStatus() const { return status_; }

  // The response parameters.
  const std::deque<InferenceParameter>& Parameters() const
  {
    return parameters_;
  }

  // Add a parameter to the response.
  Status AddParameter(const char* name, const char* value);
  Status AddParameter(const char* name, const int64_t value);
  Status AddParameter(const char* name, const bool value);
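
  // Illustrative sketch (hypothetical parameter names and values):
  //
  //   response->AddParameter("triton_final_response", true);
  //   response->AddParameter("sequence_id", static_cast<int64_t>(42));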

  // The response outputs.
  const std::deque<Output>& Outputs() const { return outputs_; }

  // Add an output to the response. If 'output' is non-null, a
  // pointer to the newly added output is returned in 'output'.
  Status AddOutput(
      const std::string& name, const inference::DataType datatype,
      const std::vector<int64_t>& shape, Output** output = nullptr);
  Status AddOutput(
      const std::string& name, const inference::DataType datatype,
      std::vector<int64_t>&& shape, Output** output = nullptr);
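
  // Illustrative sketch (hypothetical tensor name, datatype and shape; the
  // datatype value comes from the model config protobuf):
  //
  //   InferenceResponse::Output* output = nullptr;
  //   response->AddOutput("output0", inference::TYPE_FP32, {1, 3}, &output);
  //   // 'output' can then be used to allocate and fill its data buffer.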

  // Get the classification label associated with an output. Return
  // 'label' == nullptr if no label.
  Status ClassificationLabel(
      const Output& output, const uint32_t class_index,
      const char** label) const;

  // Send the response with success status. Calling this function
  // releases ownership of the response object and gives it to the
  // callback function.
  static Status Send(
      std::unique_ptr<InferenceResponse>&& response, const uint32_t flags);

  // Send the response with explicit status. Calling this function
  // releases ownership of the response object and gives it to the
  // callback function.
  static Status SendWithStatus(
      std::unique_ptr<InferenceResponse>&& response, const uint32_t flags,
      const Status& status);
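
  // Illustrative sketch of the ownership transfer (after either call the
  // 'response' pointer must not be used again; 'error_status' stands for a
  // Status describing a failure):
  //
  //   InferenceResponse::Send(
  //       std::move(response), TRITONSERVER_RESPONSE_COMPLETE_FINAL);
  //
  //   // or, to complete the response with an error:
  //   InferenceResponse::SendWithStatus(
  //       std::move(response), TRITONSERVER_RESPONSE_COMPLETE_FINAL,
  //       error_status);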

#ifdef TRITON_ENABLE_TRACING
  const std::shared_ptr<InferenceTraceProxy>& Trace() const { return trace_; }
  void SetTrace(const std::shared_ptr<InferenceTraceProxy>& trace)
  {
    trace_ = trace;
  }
  void ReleaseTrace() { trace_ = nullptr; }
#endif  // TRITON_ENABLE_TRACING

 private:
  DISALLOW_COPY_AND_ASSIGN(InferenceResponse);
  friend std::ostream& operator<<(
      std::ostream& out, const InferenceResponse& response);

#ifdef TRITON_ENABLE_TRACING
  Status TraceOutputTensors(
      TRITONSERVER_InferenceTraceActivity activity, const std::string& msg);
#endif  // TRITON_ENABLE_TRACING

  // The model associated with this response. For normal
  // requests/responses this will always be defined and acts to keep
  // the model loaded as long as this response is live. It may be
  // nullptr for cases where the model itself created the request
  // (like running requests for warmup), so any use must handle the
  // nullptr case.
  std::shared_ptr<Model> model_;

  // The ID of the corresponding request that should be included in
  // every response.
  std::string id_;

  // Error status for the response.
  Status status_;

  // The parameters of the response. Use a deque so that there is no
  // reallocation.
  std::deque<InferenceParameter> parameters_;

  // The result tensors. Use a deque so that there is no reallocation.
  std::deque<Output> outputs_;

  // The response allocator and user pointer.
  const ResponseAllocator* allocator_;
  void* alloc_userp_;

  // The response callback function and user pointer.
  TRITONSERVER_InferenceResponseCompleteFn_t response_fn_;
  void* response_userp_;

  // Delegator to be invoked on sending responses.
  std::function<void(std::unique_ptr<InferenceResponse>&&, const uint32_t)>
      response_delegator_;

  bool null_response_;

#ifdef TRITON_ENABLE_TRACING
  // Inference trace associated with this response.
  std::shared_ptr<InferenceTraceProxy> trace_;
#endif  // TRITON_ENABLE_TRACING
};

std::ostream& operator<<(std::ostream& out, const InferenceResponse& response);
std::ostream& operator<<(
    std::ostream& out, const InferenceResponse::Output& output);

}}  // namespace triton::core