"symphony/cli/requirements.txt" did not exist on "342a58b852e6d66caa7ac4204b08b3d81779b676"
backend_output_responder.h 7.88 KB
Newer Older
xiabo's avatar
xiabo committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
// Copyright 2019-2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//  * Redistributions of source code must retain the above copyright
//    notice, this list of conditions and the following disclaimer.
//  * Redistributions in binary form must reproduce the above copyright
//    notice, this list of conditions and the following disclaimer in the
//    documentation and/or other materials provided with the distribution.
//  * Neither the name of NVIDIA CORPORATION nor the names of its
//    contributors may be used to endorse or promote products derived
//    from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma once

#include <list>
#include <string>
#include <vector>
#include "triton/backend/backend_common.h"
#include "triton/common/async_work_queue.h"
#include "triton/core/tritonbackend.h"

#ifdef TRITON_ENABLE_GPU
#include <cuda_runtime_api.h>
#endif  // TRITON_ENABLE_GPU

namespace triton { namespace backend {

#ifndef TRITON_ENABLE_GPU
using cudaStream_t = void*;
using cudaEvent_t = void*;
#endif  // !TRITON_ENABLE_GPU

//
// BackendOutputResponder
//
class BackendOutputResponder {
 public:
  // The caller can optionally provide 'event' for internal synchronization
  // instead of using 'stream'.
  explicit BackendOutputResponder(
      TRITONBACKEND_Request** requests, const uint32_t request_count,
      std::vector<TRITONBACKEND_Response*>* responses,
      TRITONBACKEND_MemoryManager* memory_manager,
      const bool first_dim_batching, const bool pinned_enabled,
      cudaStream_t stream, cudaEvent_t event = nullptr,
      bool copy_on_stream = false)
      : need_sync_(false), requests_(requests), request_count_(request_count),
        responses_(responses), memory_manager_(memory_manager),
        first_dim_batching_(first_dim_batching),
        pinned_enabled_(pinned_enabled),
        use_async_cpu_copy_(triton::common::AsyncWorkQueue::WorkerCount() > 1),
        stream_(stream), event_(event), pending_pinned_byte_size_(0),
        copy_on_stream_(copy_on_stream)
  {
  }

  // Legacy constructor kept for backwards compatibility; new code should
  // use the constructor above. The responder needs to know whether the
  // model batches along the first dimension; this constructor derives
  // that from 'max_batch_size' instead of taking it directly.
  explicit BackendOutputResponder(
      TRITONBACKEND_Request** requests, const uint32_t request_count,
      std::vector<TRITONBACKEND_Response*>* responses, const int max_batch_size,
      TRITONBACKEND_MemoryManager* memory_manager, const bool pinned_enabled,
      cudaStream_t stream, cudaEvent_t event = nullptr,
      bool copy_on_stream = false)
      : need_sync_(false), requests_(requests), request_count_(request_count),
        responses_(responses), memory_manager_(memory_manager),
        first_dim_batching_(max_batch_size >= 1),
        pinned_enabled_(pinned_enabled),
        use_async_cpu_copy_(triton::common::AsyncWorkQueue::WorkerCount() > 1),
        stream_(stream), event_(event), pending_pinned_byte_size_(0),
        copy_on_stream_(copy_on_stream)
  {
  }
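
  // For example (hypothetical values): a model config with
  // max_batch_size = 8 yields first_dim_batching_ == true, identical to
  // passing 'true' to the primary constructor, while max_batch_size = 0
  // (no batching) yields false.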

  ~BackendOutputResponder();

  // Process all responses for a named output tensor.
  // 'batchn_shape' may be modified by the call.
  void ProcessTensor(
      const std::string& name, const TRITONSERVER_DataType datatype,
      std::vector<int64_t>& batchn_shape, const char* buffer,
      const TRITONSERVER_MemoryType memory_type, const int64_t memory_type_id);

  // Process all responses for a named state tensor. Returns a vector of
  // TRITONBACKEND_State objects that the backend can use to update the state.
  // If TRITONBACKEND_StateUpdate is not called on the vector elements, the
  // state will not be updated.
  // 'batchn_shape' may be modified by the call.
  std::vector<TRITONBACKEND_State*> ProcessStateTensor(
      const std::string& name, const TRITONSERVER_DataType datatype,
      std::vector<int64_t>& batchn_shape, const char* buffer,
      const TRITONSERVER_MemoryType memory_type, const int64_t memory_type_id);
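
  // Illustrative sketch (hypothetical names 'responder', 'batchn_shape',
  // and 'state_buffer'; assumes sequence batching with states configured
  // in the model, and omits error handling): commit the new state values
  // by calling TRITONBACKEND_StateUpdate on each returned state.
  //
  //   std::vector<TRITONBACKEND_State*> states =
  //       responder.ProcessStateTensor(
  //           "STATE0", TRITONSERVER_TYPE_INT32, batchn_shape,
  //           state_buffer, TRITONSERVER_MEMORY_CPU,
  //           0 /* memory_type_id */);
  //   for (TRITONBACKEND_State* state : states) {
  //     TRITONBACKEND_StateUpdate(state);  // skip to leave state unchanged
  //   }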

  // Process all responses for a batch output and derive its value from
  // 'buffer'.
  void ProcessBatchOutput(
      const std::string& name, const BatchOutput& batch_output,
      const char* buffer, const TRITONSERVER_MemoryType memory_type,
      const int64_t memory_type_id);

  // Finalize processing of all responses for all output
  // tensors. Return true if cudaMemcpyAsync is called, and the caller
  // should call cudaStreamSynchronize (or cudaEventSynchronize on 'event')
  // before using the data.
  bool Finalize();
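
  // Typical lifecycle sketch (illustrative only; 'requests',
  // 'request_count', 'responses', 'memory_manager', 'stream', 'shape',
  // and 'output_buffer' are assumed to come from the surrounding backend
  // code, and error handling is omitted):
  //
  //   BackendOutputResponder responder(
  //       requests, request_count, &responses, memory_manager,
  //       true /* first_dim_batching */, true /* pinned_enabled */,
  //       stream);
  //   responder.ProcessTensor(
  //       "OUTPUT0", TRITONSERVER_TYPE_FP32, shape, output_buffer,
  //       TRITONSERVER_MEMORY_GPU, 0 /* memory_type_id */);
  //   if (responder.Finalize()) {
  //     cudaStreamSynchronize(stream);  // or cudaEventSynchronize if an
  //                                     // 'event' was provided
  //   }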

 private:
  bool FlushPendingPinned(
      const char* tensor_buffer,
      const TRITONSERVER_MemoryType tensor_memory_type,
      const int64_t tensor_memory_type_id);
  bool SetFixedSizeBuffer(
      TRITONBACKEND_Response** response, void* response_state_or_output,
      const std::string& output_name, const size_t tensor_byte_size,
      const size_t tensor_offset, const char* tensor_buffer,
      const TRITONSERVER_MemoryType tensor_memory_type,
      const int64_t tensor_memory_type_id,
      const TRITONSERVER_MemoryType use_pinned_memory_type, bool state);

  struct OutputData {
    OutputData(
        const std::string& name, void* buffer, const size_t buffer_byte_size,
        const TRITONSERVER_MemoryType memory_type, const int64_t memory_type_id)
        : name_(name), buffer_(buffer), buffer_byte_size_(buffer_byte_size),
          memory_type_(memory_type), memory_type_id_(memory_type_id)
    {
    }
    const std::string name_;
    void* buffer_;
    const size_t buffer_byte_size_;
    const TRITONSERVER_MemoryType memory_type_;
    const int64_t memory_type_id_;
  };

  bool need_sync_;
  TRITONBACKEND_Request** requests_;
  const uint32_t request_count_;
  std::vector<TRITONBACKEND_Response*>* responses_;
  TRITONBACKEND_MemoryManager* memory_manager_;
  const bool first_dim_batching_;
  const bool pinned_enabled_;
  const bool use_async_cpu_copy_;
  cudaStream_t stream_;
  cudaEvent_t event_;

  using ResponsesList =
      std::list<std::pair<TRITONBACKEND_Response**, OutputData>>;

  size_t pending_pinned_byte_size_;
  size_t pending_pinned_offset_;
  ResponsesList pending_pinned_outputs_;
  const bool copy_on_stream_;

  // Pinned memory buffers that must remain allocated for the lifetime of
  // this BackendOutputResponder object.
  std::list<char*> pinned_memories_;

  // Pinned memory buffers and the corresponding response outputs for
  // which the final copy into the response is deferred until Finalize(),
  // after all in-flight copies have completed.
  struct DeferredPinned {
    DeferredPinned(
        char* pinned_memory, const size_t pinned_memory_size,
        ResponsesList&& responses)
        : pinned_memory_(pinned_memory),
          pinned_memory_size_(pinned_memory_size),
          responses_(std::move(responses))
    {
    }
    char* pinned_memory_;
    const size_t pinned_memory_size_;
    ResponsesList responses_;
  };

  std::list<DeferredPinned> deferred_pinned_;
};

}}  // namespace triton::backend