#include #include #ifdef USE_CUDA #include #endif namespace torio::io { //////////////////////////////////////////////////////////////////////////////// // Audio //////////////////////////////////////////////////////////////////////////////// template AudioConverter::AudioConverter(int c) : num_channels(c) { TORCH_INTERNAL_ASSERT_DEBUG_ONLY(num_channels > 0); } template torch::Tensor AudioConverter::convert(const AVFrame* src) { if constexpr (is_planar) { torch::Tensor dst = torch::empty({num_channels, src->nb_samples}, dtype); convert(src, dst); return dst.permute({1, 0}); } else { torch::Tensor dst = torch::empty({src->nb_samples, num_channels}, dtype); convert(src, dst); return dst; } } // Converts AVFrame* into pre-allocated Tensor. // The shape must be [C, T] if is_planar otherwise [T, C] template void AudioConverter::convert( const AVFrame* src, torch::Tensor& dst) { TORCH_INTERNAL_ASSERT_DEBUG_ONLY(src); TORCH_INTERNAL_ASSERT_DEBUG_ONLY(num_channels == src->channels); constexpr int bps = []() { switch (dtype) { case torch::kUInt8: return 1; case torch::kInt16: return 2; case torch::kInt32: case torch::kFloat32: return 4; case torch::kInt64: case torch::kFloat64: return 8; } }(); // Note // FFMpeg's `nb_samples` represnts the number of samples par channel. // whereas, in torchaudio, `num_samples` is used to represent the number of // samples across channels. torchaudio uses `num_frames` for per-channel // samples. if constexpr (is_planar) { int plane_size = bps * src->nb_samples; uint8_t* p_dst = static_cast(dst.data_ptr()); for (int i = 0; i < num_channels; ++i) { memcpy(p_dst, src->extended_data[i], plane_size); p_dst += plane_size; } } else { int plane_size = bps * src->nb_samples * num_channels; memcpy(dst.data_ptr(), src->extended_data[0], plane_size); } } // Explicit instantiation template class AudioConverter; template class AudioConverter; template class AudioConverter; template class AudioConverter; template class AudioConverter; template class AudioConverter; template class AudioConverter; template class AudioConverter; template class AudioConverter; template class AudioConverter; template class AudioConverter; template class AudioConverter; //////////////////////////////////////////////////////////////////////////////// // Image //////////////////////////////////////////////////////////////////////////////// namespace { torch::Tensor get_image_buffer( at::IntArrayRef shape, const torch::Dtype dtype = torch::kUInt8) { return torch::empty( shape, torch::TensorOptions().dtype(dtype).layout(torch::kStrided)); } #ifdef USE_CUDA torch::Tensor get_image_buffer( at::IntArrayRef shape, torch::Device device, const torch::Dtype dtype = torch::kUInt8) { return torch::empty( shape, torch::TensorOptions() .dtype(dtype) .layout(torch::kStrided) .device(device)); } #endif // USE_CUDA } // namespace ImageConverterBase::ImageConverterBase(int h, int w, int c) : height(h), width(w), num_channels(c) { TORCH_INTERNAL_ASSERT_DEBUG_ONLY(height > 0); TORCH_INTERNAL_ASSERT_DEBUG_ONLY(width > 0); TORCH_INTERNAL_ASSERT_DEBUG_ONLY(num_channels > 0); } //////////////////////////////////////////////////////////////////////////////// // Interlaced Image //////////////////////////////////////////////////////////////////////////////// void InterlacedImageConverter::convert(const AVFrame* src, torch::Tensor& dst) { TORCH_INTERNAL_ASSERT_DEBUG_ONLY(src); TORCH_INTERNAL_ASSERT_DEBUG_ONLY(src->height == height); TORCH_INTERNAL_ASSERT_DEBUG_ONLY(dst.size(1) == height); int stride = width * num_channels; TORCH_INTERNAL_ASSERT_DEBUG_ONLY(dst.size(2) * dst.size(3) == stride); auto p_dst = dst.data_ptr(); uint8_t* p_src = src->data[0]; for (int i = 0; i < height; ++i) { memcpy(p_dst, p_src, stride); p_src += src->linesize[0]; p_dst += stride; } } torch::Tensor InterlacedImageConverter::convert(const AVFrame* src) { torch::Tensor buffer = get_image_buffer({1, height, width, num_channels}); convert(src, buffer); return buffer.permute({0, 3, 1, 2}); } //////////////////////////////////////////////////////////////////////////////// // Interlaced 16 Bit Image //////////////////////////////////////////////////////////////////////////////// void Interlaced16BitImageConverter::convert( const AVFrame* src, torch::Tensor& dst) { TORCH_INTERNAL_ASSERT_DEBUG_ONLY(src); TORCH_INTERNAL_ASSERT_DEBUG_ONLY(src->height == height); TORCH_INTERNAL_ASSERT_DEBUG_ONLY(dst.size(1) == height); int stride = width * num_channels; TORCH_INTERNAL_ASSERT_DEBUG_ONLY(dst.size(2) * dst.size(3) == stride); auto p_dst = dst.data_ptr(); uint8_t* p_src = src->data[0]; for (int i = 0; i < height; ++i) { memcpy(p_dst, p_src, stride * 2); p_src += src->linesize[0]; p_dst += stride; } // correct for int16 dst += 32768; } torch::Tensor Interlaced16BitImageConverter::convert(const AVFrame* src) { torch::Tensor buffer = get_image_buffer({1, height, width, num_channels}, torch::kInt16); convert(src, buffer); return buffer.permute({0, 3, 1, 2}); } //////////////////////////////////////////////////////////////////////////////// // Planar Image //////////////////////////////////////////////////////////////////////////////// void PlanarImageConverter::convert(const AVFrame* src, torch::Tensor& dst) { TORCH_INTERNAL_ASSERT_DEBUG_ONLY(src); TORCH_INTERNAL_ASSERT_DEBUG_ONLY(src->height == height); TORCH_INTERNAL_ASSERT_DEBUG_ONLY(src->width == width); TORCH_INTERNAL_ASSERT_DEBUG_ONLY(dst.size(1) == num_channels); TORCH_INTERNAL_ASSERT_DEBUG_ONLY(dst.size(2) == height); TORCH_INTERNAL_ASSERT_DEBUG_ONLY(dst.size(3) == width); for (int i = 0; i < num_channels; ++i) { torch::Tensor plane = dst.index({0, i}); uint8_t* p_dst = plane.data_ptr(); uint8_t* p_src = src->data[i]; int linesize = src->linesize[i]; for (int h = 0; h < height; ++h) { memcpy(p_dst, p_src, width); p_src += linesize; p_dst += width; } } } torch::Tensor PlanarImageConverter::convert(const AVFrame* src) { torch::Tensor buffer = get_image_buffer({1, num_channels, height, width}); convert(src, buffer); return buffer; } //////////////////////////////////////////////////////////////////////////////// // YUV420P //////////////////////////////////////////////////////////////////////////////// YUV420PConverter::YUV420PConverter(int h, int w) : ImageConverterBase(h, w, 3) { TORCH_WARN_ONCE( "The output format YUV420P is selected. " "This will be implicitly converted to YUV444P, " "in which all the color components Y, U, V have the same dimension."); } void YUV420PConverter::convert(const AVFrame* src, torch::Tensor& dst) { TORCH_INTERNAL_ASSERT_DEBUG_ONLY(src); TORCH_INTERNAL_ASSERT_DEBUG_ONLY( (AVPixelFormat)(src->format) == AV_PIX_FMT_YUV420P); TORCH_INTERNAL_ASSERT_DEBUG_ONLY(src->height == height); TORCH_INTERNAL_ASSERT_DEBUG_ONLY(src->width == width); TORCH_INTERNAL_ASSERT_DEBUG_ONLY(dst.size(1) == 3); TORCH_INTERNAL_ASSERT_DEBUG_ONLY(dst.size(2) == height); TORCH_INTERNAL_ASSERT_DEBUG_ONLY(dst.size(3) == width); // Write Y plane directly { uint8_t* p_dst = dst.data_ptr(); uint8_t* p_src = src->data[0]; for (int h = 0; h < height; ++h) { memcpy(p_dst, p_src, width); p_dst += width; p_src += src->linesize[0]; } } // Chroma (U and V planes) are subsamapled by 2 in both vertical and // holizontal directions. // https://en.wikipedia.org/wiki/Chroma_subsampling // Since we are returning data in Tensor, which has the same size for all // color planes, we need to upsample the UV planes. PyTorch has interpolate // function but it does not work for int16 type. So we manually copy them. // // block1 block2 block3 block4 // ab -> aabb = a b * a b * * // cd aabb a b a b // ccdd c d c d // ccdd c d c d // auto block00 = dst.slice(2, 0, {}, 2).slice(3, 0, {}, 2); auto block01 = dst.slice(2, 0, {}, 2).slice(3, 1, {}, 2); auto block10 = dst.slice(2, 1, {}, 2).slice(3, 0, {}, 2); auto block11 = dst.slice(2, 1, {}, 2).slice(3, 1, {}, 2); for (int i = 1; i < 3; ++i) { // borrow data auto tmp = torch::from_blob( src->data[i], {height / 2, width / 2}, {src->linesize[i], 1}, [](void*) {}, torch::TensorOptions().dtype(torch::kUInt8).layout(torch::kStrided)); // Copy to each block block00.slice(1, i, i + 1).copy_(tmp); block01.slice(1, i, i + 1).copy_(tmp); block10.slice(1, i, i + 1).copy_(tmp); block11.slice(1, i, i + 1).copy_(tmp); } } torch::Tensor YUV420PConverter::convert(const AVFrame* src) { torch::Tensor buffer = get_image_buffer({1, num_channels, height, width}); convert(src, buffer); return buffer; } //////////////////////////////////////////////////////////////////////////////// // YUV420P10LE //////////////////////////////////////////////////////////////////////////////// YUV420P10LEConverter::YUV420P10LEConverter(int h, int w) : ImageConverterBase(h, w, 3) { TORCH_WARN_ONCE( "The output format YUV420PLE is selected. " "This will be implicitly converted to YUV444P (16-bit), " "in which all the color components Y, U, V have the same dimension."); } void YUV420P10LEConverter::convert(const AVFrame* src, torch::Tensor& dst) { TORCH_INTERNAL_ASSERT_DEBUG_ONLY(src); TORCH_INTERNAL_ASSERT_DEBUG_ONLY( (AVPixelFormat)(src->format) == AV_PIX_FMT_YUV420P10LE); TORCH_INTERNAL_ASSERT_DEBUG_ONLY(src->height == height); TORCH_INTERNAL_ASSERT_DEBUG_ONLY(src->width == width); TORCH_INTERNAL_ASSERT_DEBUG_ONLY(dst.size(1) == 3); TORCH_INTERNAL_ASSERT_DEBUG_ONLY(dst.size(2) == height); TORCH_INTERNAL_ASSERT_DEBUG_ONLY(dst.size(3) == width); TORCH_INTERNAL_ASSERT_DEBUG_ONLY(dst.dtype() == torch::kInt16); // Write Y plane directly { int16_t* p_dst = dst.data_ptr(); uint8_t* p_src = src->data[0]; for (int h = 0; h < height; ++h) { memcpy(p_dst, p_src, (size_t)width * 2); p_dst += width; p_src += src->linesize[0]; } } // Chroma (U and V planes) are subsamapled by 2 in both vertical and // holizontal directions. // https://en.wikipedia.org/wiki/Chroma_subsampling // Since we are returning data in Tensor, which has the same size for all // color planes, we need to upsample the UV planes. PyTorch has interpolate // function but it does not work for int16 type. So we manually copy them. // // block1 block2 block3 block4 // ab -> aabb = a b * a b * * // cd aabb a b a b // ccdd c d c d // ccdd c d c d // auto block00 = dst.slice(2, 0, {}, 2).slice(3, 0, {}, 2); auto block01 = dst.slice(2, 0, {}, 2).slice(3, 1, {}, 2); auto block10 = dst.slice(2, 1, {}, 2).slice(3, 0, {}, 2); auto block11 = dst.slice(2, 1, {}, 2).slice(3, 1, {}, 2); for (int i = 1; i < 3; ++i) { // borrow data auto tmp = torch::from_blob( src->data[i], {height / 2, width / 2}, {src->linesize[i] / 2, 1}, [](void*) {}, torch::TensorOptions().dtype(torch::kInt16).layout(torch::kStrided)); // Copy to each block block00.slice(1, i, i + 1).copy_(tmp); block01.slice(1, i, i + 1).copy_(tmp); block10.slice(1, i, i + 1).copy_(tmp); block11.slice(1, i, i + 1).copy_(tmp); } } torch::Tensor YUV420P10LEConverter::convert(const AVFrame* src) { torch::Tensor buffer = get_image_buffer({1, num_channels, height, width}, torch::kInt16); convert(src, buffer); return buffer; } //////////////////////////////////////////////////////////////////////////////// // NV12 //////////////////////////////////////////////////////////////////////////////// NV12Converter::NV12Converter(int h, int w) : ImageConverterBase(h, w, 3) { TORCH_WARN_ONCE( "The output format NV12 is selected. " "This will be implicitly converted to YUV444P, " "in which all the color components Y, U, V have the same dimension."); } void NV12Converter::convert(const AVFrame* src, torch::Tensor& dst) { TORCH_INTERNAL_ASSERT_DEBUG_ONLY(src); TORCH_INTERNAL_ASSERT_DEBUG_ONLY( (AVPixelFormat)(src->format) == AV_PIX_FMT_NV12); TORCH_INTERNAL_ASSERT_DEBUG_ONLY(src->height == height); TORCH_INTERNAL_ASSERT_DEBUG_ONLY(src->width == width); TORCH_INTERNAL_ASSERT_DEBUG_ONLY(dst.size(1) == 3); TORCH_INTERNAL_ASSERT_DEBUG_ONLY(dst.size(2) == height); TORCH_INTERNAL_ASSERT_DEBUG_ONLY(dst.size(3) == width); // Write Y plane directly { uint8_t* p_dst = dst.data_ptr(); uint8_t* p_src = src->data[0]; for (int h = 0; h < height; ++h) { memcpy(p_dst, p_src, width); p_dst += width; p_src += src->linesize[0]; } } // Write intermediate UV plane { auto tmp = torch::from_blob( src->data[1], {height / 2, width}, {src->linesize[1], 1}, [](void*) {}, torch::TensorOptions().dtype(torch::kUInt8).layout(torch::kStrided)); tmp = tmp.view({1, height / 2, width / 2, 2}).permute({0, 3, 1, 2}); auto dst_uv = dst.slice(1, 1, 3); dst_uv.slice(2, 0, {}, 2).slice(3, 0, {}, 2).copy_(tmp); dst_uv.slice(2, 0, {}, 2).slice(3, 1, {}, 2).copy_(tmp); dst_uv.slice(2, 1, {}, 2).slice(3, 0, {}, 2).copy_(tmp); dst_uv.slice(2, 1, {}, 2).slice(3, 1, {}, 2).copy_(tmp); } } torch::Tensor NV12Converter::convert(const AVFrame* src) { torch::Tensor buffer = get_image_buffer({1, num_channels, height, width}); convert(src, buffer); return buffer; } #ifdef USE_CUDA CudaImageConverterBase::CudaImageConverterBase(const torch::Device& device) : device(device) {} //////////////////////////////////////////////////////////////////////////////// // NV12 CUDA //////////////////////////////////////////////////////////////////////////////// NV12CudaConverter::NV12CudaConverter(const torch::Device& device) : CudaImageConverterBase(device) { TORCH_WARN_ONCE( "The output format NV12 is selected. " "This will be implicitly converted to YUV444P, " "in which all the color components Y, U, V have the same dimension."); } void NV12CudaConverter::convert(const AVFrame* src, torch::Tensor& dst) { TORCH_INTERNAL_ASSERT_DEBUG_ONLY(src); TORCH_INTERNAL_ASSERT_DEBUG_ONLY(src->height == height); TORCH_INTERNAL_ASSERT_DEBUG_ONLY(src->width == width); TORCH_INTERNAL_ASSERT_DEBUG_ONLY(dst.size(1) == 3); TORCH_INTERNAL_ASSERT_DEBUG_ONLY(dst.size(2) == height); TORCH_INTERNAL_ASSERT_DEBUG_ONLY(dst.size(3) == width); TORCH_INTERNAL_ASSERT_DEBUG_ONLY(dst.dtype() == torch::kUInt8); auto fmt = (AVPixelFormat)(src->format); AVHWFramesContext* hwctx = (AVHWFramesContext*)src->hw_frames_ctx->data; AVPixelFormat sw_fmt = hwctx->sw_format; TORCH_INTERNAL_ASSERT( AV_PIX_FMT_CUDA == fmt, "Expected CUDA frame. Found: ", av_get_pix_fmt_name(fmt)); TORCH_INTERNAL_ASSERT( AV_PIX_FMT_NV12 == sw_fmt, "Expected NV12 format. Found: ", av_get_pix_fmt_name(sw_fmt)); // Write Y plane directly auto status = cudaMemcpy2D( dst.data_ptr(), width, src->data[0], src->linesize[0], width, height, cudaMemcpyDeviceToDevice); TORCH_CHECK(cudaSuccess == status, "Failed to copy Y plane to Cuda tensor."); // Preapare intermediate UV planes status = cudaMemcpy2D( tmp_uv.data_ptr(), width, src->data[1], src->linesize[1], width, height / 2, cudaMemcpyDeviceToDevice); TORCH_CHECK(cudaSuccess == status, "Failed to copy UV plane to Cuda tensor."); // Upsample width and height namespace F = torch::nn::functional; torch::Tensor uv = F::interpolate( tmp_uv.permute({0, 3, 1, 2}), F::InterpolateFuncOptions() .mode(torch::kNearest) .size(std::vector({height, width}))); // Write to the UV plane // dst[:, 1:] = uv using namespace torch::indexing; dst.index_put_({Slice(), Slice(1)}, uv); } torch::Tensor NV12CudaConverter::convert(const AVFrame* src) { TORCH_INTERNAL_ASSERT_DEBUG_ONLY(src); if (!init) { height = src->height; width = src->width; tmp_uv = get_image_buffer({1, height / 2, width / 2, 2}, device, torch::kUInt8); init = true; } torch::Tensor buffer = get_image_buffer({1, 3, height, width}, device); convert(src, buffer); return buffer; } //////////////////////////////////////////////////////////////////////////////// // P010 CUDA //////////////////////////////////////////////////////////////////////////////// P010CudaConverter::P010CudaConverter(const torch::Device& device) : CudaImageConverterBase{device} { TORCH_WARN_ONCE( "The output format P010 is selected. " "This will be implicitly converted to YUV444P, " "in which all the color components Y, U, V have the same dimension."); } void P010CudaConverter::convert(const AVFrame* src, torch::Tensor& dst) { TORCH_INTERNAL_ASSERT_DEBUG_ONLY(src); TORCH_INTERNAL_ASSERT_DEBUG_ONLY(src->height == height); TORCH_INTERNAL_ASSERT_DEBUG_ONLY(src->width == width); TORCH_INTERNAL_ASSERT_DEBUG_ONLY(dst.size(1) == 3); TORCH_INTERNAL_ASSERT_DEBUG_ONLY(dst.size(2) == height); TORCH_INTERNAL_ASSERT_DEBUG_ONLY(dst.size(3) == width); TORCH_INTERNAL_ASSERT_DEBUG_ONLY(dst.dtype() == torch::kInt16); auto fmt = (AVPixelFormat)(src->format); AVHWFramesContext* hwctx = (AVHWFramesContext*)src->hw_frames_ctx->data; AVPixelFormat sw_fmt = hwctx->sw_format; TORCH_INTERNAL_ASSERT( AV_PIX_FMT_CUDA == fmt, "Expected CUDA frame. Found: ", av_get_pix_fmt_name(fmt)); TORCH_INTERNAL_ASSERT( AV_PIX_FMT_P010 == sw_fmt, "Expected P010 format. Found: ", av_get_pix_fmt_name(sw_fmt)); // Write Y plane directly auto status = cudaMemcpy2D( dst.data_ptr(), width * 2, src->data[0], src->linesize[0], width * 2, height, cudaMemcpyDeviceToDevice); TORCH_CHECK(cudaSuccess == status, "Failed to copy Y plane to CUDA tensor."); // Prepare intermediate UV planes status = cudaMemcpy2D( tmp_uv.data_ptr(), width * 2, src->data[1], src->linesize[1], width * 2, height / 2, cudaMemcpyDeviceToDevice); TORCH_CHECK(cudaSuccess == status, "Failed to copy UV plane to CUDA tensor."); // Write to the UV plane torch::Tensor uv = tmp_uv.permute({0, 3, 1, 2}); using namespace torch::indexing; // very simplistic upscale using indexing since interpolate doesn't support // shorts dst.index_put_( {Slice(), Slice(1, 3), Slice(None, None, 2), Slice(None, None, 2)}, uv); dst.index_put_( {Slice(), Slice(1, 3), Slice(1, None, 2), Slice(None, None, 2)}, uv); dst.index_put_( {Slice(), Slice(1, 3), Slice(None, None, 2), Slice(1, None, 2)}, uv); dst.index_put_( {Slice(), Slice(1, 3), Slice(1, None, 2), Slice(1, None, 2)}, uv); // correct for int16 dst += 32768; } torch::Tensor P010CudaConverter::convert(const AVFrame* src) { TORCH_INTERNAL_ASSERT_DEBUG_ONLY(src); if (!init) { height = src->height; width = src->width; tmp_uv = get_image_buffer({1, height / 2, width / 2, 2}, device, torch::kInt16); init = true; } torch::Tensor buffer = get_image_buffer({1, 3, height, width}, device, torch::kInt16); convert(src, buffer); return buffer; } //////////////////////////////////////////////////////////////////////////////// // YUV444P CUDA //////////////////////////////////////////////////////////////////////////////// YUV444PCudaConverter::YUV444PCudaConverter(const torch::Device& device) : CudaImageConverterBase(device) {} void YUV444PCudaConverter::convert(const AVFrame* src, torch::Tensor& dst) { TORCH_INTERNAL_ASSERT_DEBUG_ONLY(src); TORCH_INTERNAL_ASSERT_DEBUG_ONLY(src->height == height); TORCH_INTERNAL_ASSERT_DEBUG_ONLY(src->width == width); TORCH_INTERNAL_ASSERT_DEBUG_ONLY(dst.size(1) == 3); TORCH_INTERNAL_ASSERT_DEBUG_ONLY(dst.size(2) == height); TORCH_INTERNAL_ASSERT_DEBUG_ONLY(dst.size(3) == width); TORCH_INTERNAL_ASSERT_DEBUG_ONLY(dst.dtype() == torch::kUInt8); auto fmt = (AVPixelFormat)(src->format); AVHWFramesContext* hwctx = (AVHWFramesContext*)src->hw_frames_ctx->data; AVPixelFormat sw_fmt = hwctx->sw_format; TORCH_INTERNAL_ASSERT( AV_PIX_FMT_CUDA == fmt, "Expected CUDA frame. Found: ", av_get_pix_fmt_name(fmt)); TORCH_INTERNAL_ASSERT( AV_PIX_FMT_YUV444P == sw_fmt, "Expected YUV444P format. Found: ", av_get_pix_fmt_name(sw_fmt)); // Write Y plane directly for (int i = 0; i < 3; ++i) { auto status = cudaMemcpy2D( dst.index({0, i}).data_ptr(), width, src->data[i], src->linesize[i], width, height, cudaMemcpyDeviceToDevice); TORCH_CHECK( cudaSuccess == status, "Failed to copy plane ", i, " to CUDA tensor."); } } torch::Tensor YUV444PCudaConverter::convert(const AVFrame* src) { TORCH_INTERNAL_ASSERT_DEBUG_ONLY(src); if (!init) { height = src->height; width = src->width; init = true; } torch::Tensor buffer = get_image_buffer({1, 3, height, width}, device); convert(src, buffer); return buffer; } #endif } // namespace torio::io